OCR using Automator

How can I make this script more efficient and more robust?



#!/usr/bin/env python3


import os

import sys

import subprocess

import glob

import re

import easyocr

import cv2

import numpy as np


# Locations probed for pdftoppm before falling back to a PATH lookup.
# Automator runs scripts with a minimal PATH, so the usual Homebrew
# prefixes are checked explicitly (Intel first, then Apple Silicon).
PDFTOPPM_CANDIDATES = (
    "/usr/local/bin/pdftoppm",
    "/opt/homebrew/bin/pdftoppm",
)


def resolve_paths(argv):
    """Derive the working paths from the command-line arguments.

    Args:
        argv: Argument list shaped like ``sys.argv``; ``argv[1]`` is the PDF
            path and the optional ``argv[2]`` is the output folder.

    Returns:
        A ``(pdf_path, output_folder, output_prefix)`` tuple, where
        ``output_prefix`` is the prefix handed to pdftoppm
        (``<output_folder>/<pdf name without extension>_ocr``).
    """
    pdf_path = argv[1]
    requested_folder = argv[2] if len(argv) > 2 else ""
    # dirname() is "" for a bare filename; fall back to the current directory
    # so that os.makedirs() is never called with an empty path.
    output_folder = requested_folder or os.path.dirname(pdf_path) or "."
    basename = os.path.splitext(os.path.basename(pdf_path))[0]
    output_prefix = os.path.join(output_folder, f"{basename}_ocr")
    return pdf_path, output_folder, output_prefix


def find_pdftoppm():
    """Return the pdftoppm executable to run.

    The known Homebrew locations are tried first, then ``PATH``. When nothing
    is found the original hard-coded path is returned so that the subsequent
    ``subprocess.run`` call fails with a ``FileNotFoundError`` naming it.
    """
    import shutil  # local import: only this helper needs it

    for candidate in PDFTOPPM_CANDIDATES:
        if os.path.isfile(candidate) and os.access(candidate, os.X_OK):
            return candidate
    return shutil.which("pdftoppm") or PDFTOPPM_CANDIDATES[0]


def render_pdf_to_tiffs(pdf_path, output_prefix, dpi=300):
    """Render every page of ``pdf_path`` to ``<output_prefix>-<n>.tif``.

    Raises:
        subprocess.CalledProcessError: pdftoppm exited with a non-zero status.
        FileNotFoundError: the pdftoppm executable does not exist.
    """
    # -forcenum keeps the "-<n>" suffix even for single-page PDFs, which the
    # page lookup below relies on.
    result = subprocess.run(
        [find_pdftoppm(), "-r", str(dpi), "-tiff", "-forcenum",
         pdf_path, output_prefix],
        check=True,
        capture_output=True,
        text=True,
    )
    print("TIFFs generados:", result.stdout)
    if result.stderr:
        print("Advertencias/errores de pdftoppm:", result.stderr)


def find_page_tiffs(output_prefix):
    """Return the page images written for ``output_prefix``, in page order.

    ``.tif`` files are preferred; ``.tiff`` is only considered when no
    ``.tif`` page exists. Only files named exactly
    ``<prefix>-<digits><extension>`` are returned, so unrelated files that
    merely share the prefix are ignored, and the page number is taken from
    the file name itself rather than from anywhere in the path.

    Returns:
        A list of paths sorted by numeric page number (empty if none exist).
    """
    prefix_name = os.path.basename(output_prefix)
    # Escape glob metacharacters ("[", "*", "?") that may occur in the PDF
    # name or folder, otherwise they would be interpreted as wildcards.
    escaped_prefix = glob.escape(output_prefix)
    for extension in (".tif", ".tiff"):
        page_re = re.compile(
            re.escape(prefix_name) + r"-(\d+)" + re.escape(extension)
        )
        numbered = []
        for path in glob.glob(f"{escaped_prefix}-*{extension}"):
            match = page_re.fullmatch(os.path.basename(path))
            if match:
                numbered.append((int(match.group(1)), path))
        if numbered:
            numbered.sort()
            return [path for _, path in numbered]
    return []


def ocr_pages(tiff_files, pdf_path, output_folder, languages=("es",)):
    """OCR each page image and append the text to ``<output_folder>/out.text``.

    The recognised text is also echoed to stdout, framed by per-page and
    per-document marker lines.

    Args:
        tiff_files: Page image paths, already in page order.
        pdf_path: Source PDF path, used only in the marker lines.
        output_folder: Folder that receives ``out.text`` (opened in append
            mode).
        languages: EasyOCR language codes; ``"es"`` is Spanish.

    Raises:
        ValueError: OpenCV could not read one of the page images.
    """
    # The reader loads its models on construction, so build it once for
    # the whole document rather than once per page.
    reader = easyocr.Reader(list(languages))
    ocr_output_text = os.path.join(output_folder, "out.text")
    with open(ocr_output_text, "a", encoding="utf-8") as f:
        f.write(f"\n=== Inicio de documento: {pdf_path} ===\n")
        for i, tiff_path in enumerate(tiff_files, 1):
            tiff_base = os.path.basename(tiff_path)
            header = f"--- Documento: {tiff_base} | Página: {i} ---"
            footer = f"--- Fin página {i} de {tiff_base} ---"
            print(header)
            f.write(header + "\n")

            # Read as grayscale and binarise with Otsu's threshold before OCR.
            img = cv2.imread(tiff_path, cv2.IMREAD_GRAYSCALE)
            if img is None:
                raise ValueError("No se pudo leer la imagen")
            _, thresh = cv2.threshold(
                img, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
            )

            # detail=0 makes EasyOCR return just the recognised strings.
            ocr_text = " ".join(reader.readtext(thresh, detail=0))
            print(ocr_text)
            f.write(ocr_text + "\n")

            print(footer)
            f.write(footer + "\n")
        f.write(f"=== Fin de documento: {pdf_path} ===\n")


def remove_files(paths):
    """Delete ``paths``, reporting (not raising) files that cannot be removed."""
    for path in paths:
        try:
            os.remove(path)
        except OSError as e:
            print(f"No se pudo borrar {path}: {e}")


def main(argv=None):
    """Run the PDF -> TIFF -> OCR pipeline.

    Args:
        argv: Argument list shaped like ``sys.argv``; defaults to ``sys.argv``.

    Returns:
        Process exit status: 0 on success, 1 on a usage error or failure.
    """
    argv = sys.argv if argv is None else argv

    # Check arguments.
    if len(argv) < 2:
        print("Uso: python3 ocr_global.py <ruta_del_pdf> [carpeta_de_salida]")
        return 1

    pdf_path, output_folder, output_prefix = resolve_paths(argv)
    tiff_files = []
    status = 1
    try:
        # Create the output folder if it does not exist.
        os.makedirs(output_folder, exist_ok=True)

        render_pdf_to_tiffs(pdf_path, output_prefix)

        tiff_files = find_page_tiffs(output_prefix)
        if not tiff_files:
            print(
                "Archivos encontrados para depuración:",
                glob.glob(f"{glob.escape(output_prefix)}*"),
            )
            raise FileNotFoundError(
                f"No se encontraron TIFFs en {output_prefix}-*.tiff"
            )

        ocr_pages(tiff_files, pdf_path, output_folder)
        status = 0
    except subprocess.CalledProcessError as e:
        print(f"Error al procesar el PDF: {e}")
        print("Salida de error:", e.stderr)
        # pdftoppm may have written some pages before failing; clean them up.
        tiff_files = find_page_tiffs(output_prefix)
    except FileNotFoundError as e:
        print(f"Error: {e}")
    except Exception as e:
        # Top-level boundary of a command-line script: report and exit non-zero.
        print(f"Error inesperado: {e}")
    finally:
        # The page images are large intermediates; delete them whether or
        # not OCR succeeded.
        if tiff_files:
            remove_files(tiff_files)
            print("TIFFs borrados después de OCR.")
    return status


if __name__ == "__main__":
    sys.exit(main())

MacBook Pro 13″, macOS 13.7

Posted on Sep 29, 2025 8:53 AM

Reply
4 replies

Sep 30, 2025 7:43 AM in response to Old Toad

VueScan can produce an OCR'd PDF if that option is chosen prior to the scan. It is not, however, a post-processing feature that can be applied after the PDF has been produced.


Code Industry's Master PDF Editor can post-process scanned PDFs and produce OCR'd results. Its price is now 80 USD; considering how much time has already been lost, and may yet be lost, on the presented Python code, that price may be a bargain.

This thread has been closed by the system or the community team. You may vote for any posts you find helpful, or search the Community for additional answers.

OCR using automator

Welcome to Apple Support Community
A forum where Apple customers help each other with their products. Get started with your Apple Account.