OCR using automator

Question

ofranyut Author

Level 1

4 points

OCR using automator

How can I make this script more efficient and better?

#!/usr/bin/env python3

import os

import sys

import subprocess

import glob

import re

import easyocr

import cv2

import numpy as np

# Validate the command line: the PDF path is required, the output folder is
# optional.
if len(sys.argv) < 2:
    print("Uso: python3 ocr_global.py <ruta_del_pdf> [carpeta_de_salida]")
    sys.exit(1)

# Resolve the PDF path and the output folder.
pdf_path = sys.argv[1]

# Default to the folder that contains the PDF. os.path.dirname() returns ""
# when the PDF is given as a bare file name (and the optional argument may
# itself be empty); os.makedirs("") raises FileNotFoundError, so fall back to
# the current directory in that case.
output_folder = sys.argv[2] if len(sys.argv) > 2 else os.path.dirname(pdf_path)
output_folder = output_folder or "."

# Prefix handed to pdftoppm; the page images are written as "<prefix>-<n>.<ext>".
basename = os.path.splitext(os.path.basename(pdf_path))[0]
output_prefix = os.path.join(output_folder, f"{basename}_ocr")

# Create the output folder if it does not exist yet.
os.makedirs(output_folder, exist_ok=True)

# Page TIFFs produced for this run. Initialised before the try so the finally
# clause can always clean up, even when conversion or OCR fails part-way.
tiff_files = []

try:
    # pdftoppm is addressed by absolute path (presumably because Automator
    # runs scripts with a minimal PATH). Homebrew installs under
    # /usr/local/bin on Intel Macs and /opt/homebrew/bin on Apple Silicon;
    # use whichever exists, falling back to the original location so a
    # missing binary is still reported through the FileNotFoundError handler.
    pdftoppm_candidates = ("/usr/local/bin/pdftoppm", "/opt/homebrew/bin/pdftoppm")
    pdftoppm = next(
        (path for path in pdftoppm_candidates if os.path.isfile(path)),
        pdftoppm_candidates[0],
    )

    # Render every page as a 300 DPI TIFF; -forcenum keeps the page number in
    # the file name even for single-page documents.
    conversion = subprocess.run(
        [pdftoppm, "-r", "300", "-tiff", "-forcenum", pdf_path, output_prefix],
        check=True,
        capture_output=True,
        text=True,
    )
    print("TIFFs generados:", conversion.stdout)
    if conversion.stderr:
        print("Advertencias/errores de pdftoppm:", conversion.stderr)

    # Locate the generated pages, probing ".tif" first and then ".tiff".
    # The prefix is escaped so glob metacharacters in the path ("[", "*", "?")
    # are matched literally.
    escaped_prefix = glob.escape(output_prefix)
    # Anchor the page number to the end of the name: the directory or the PDF
    # name may itself contain "-<digits>" (e.g. "report-2024"), which must not
    # be mistaken for the page number.
    page_number_re = re.compile(r"-(\d+)\.tiff?$")
    numbered_pages = []
    for extension in ("tif", "tiff"):
        tiff_pattern = f"{output_prefix}-*.{extension}"
        for candidate in glob.glob(f"{escaped_prefix}-*.{extension}"):
            match = page_number_re.search(candidate)
            if match:
                numbered_pages.append((int(match.group(1)), candidate))
        if numbered_pages:
            break

    if not numbered_pages:
        print("Archivos encontrados para depuración:", glob.glob(f"{escaped_prefix}*"))
        raise FileNotFoundError(f"No se encontraron TIFFs en {tiff_pattern}")

    # Sort numerically by page number (a plain string sort would put 10 before 2).
    numbered_pages.sort()
    tiff_files = [path for _, path in numbered_pages]

    # Run EasyOCR over every page. The reader is created once and reused for
    # all pages; 'es' selects Spanish, adjust to the document language.
    reader = easyocr.Reader(['es'])
    ocr_output_text = os.path.join(output_folder, "out.text")

    # Append mode: results of several PDFs accumulate in the same out.text.
    with open(ocr_output_text, 'a', encoding='utf-8') as f:
        f.write(f"\n=== Inicio de documento: {pdf_path} ===\n")

        for i, tiff_path in enumerate(tiff_files, 1):
            tiff_base = os.path.basename(tiff_path)
            print(f"--- Documento: {tiff_base} | Página: {i} ---")
            f.write(f"--- Documento: {tiff_base} | Página: {i} ---\n")

            # Load the page as grayscale and binarise it with Otsu's threshold.
            img = cv2.imread(tiff_path, cv2.IMREAD_GRAYSCALE)
            if img is None:
                raise ValueError("No se pudo leer la imagen")
            _, thresh = cv2.threshold(img, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

            # Each detection is (bounding box, text, confidence); keep only
            # the text and join the fragments with spaces.
            detections = reader.readtext(thresh)
            ocr_text = " ".join(text for _, text, _ in detections)

            print(ocr_text)
            f.write(ocr_text + "\n")
            print(f"--- Fin página {i} de {tiff_base} ---")
            f.write(f"--- Fin página {i} de {tiff_base} ---\n")

        f.write(f"=== Fin de documento: {pdf_path} ===\n")

except subprocess.CalledProcessError as e:
    print(f"Error al procesar el PDF: {e}")
    print("Salida de error:", e.stderr)
    # Report the failure to the caller, consistent with the usage check.
    sys.exit(1)
except FileNotFoundError as e:
    print(f"Error: {e}")
    sys.exit(1)
except Exception as e:
    # Top-level boundary of the script: report anything unexpected and fail.
    print(f"Error inesperado: {e}")
    sys.exit(1)
finally:
    # Remove the intermediate TIFFs whether or not OCR succeeded, so a failed
    # run does not leave large page images behind.
    removed = 0
    for tiff in tiff_files:
        try:
            os.remove(tiff)
            removed += 1
        except OSError as e:
            print(f"No se pudo borrar {tiff}: {e}")
    if removed:
        print("TIFFs borrados después de OCR.")

MacBook Pro 13″, macOS 13.7

Posted on Sep 29, 2025 8:53 AM

Reply

Answer 1

VikingOSX

Level 10

123,006 points

Sep 30, 2025 7:43 AM in response to Old Toad

VueScan can produce an OCR'd PDF if that option is chosen prior to the scan. It is not, however, a post-processing feature that can be applied after the PDF is produced.

Code Industry's Master PDF Editor can post-process scanned PDFs and produce OCR'd results. Its price is now 80 USD, and considering how much time has already been lost — and may yet be lost — on the presented Python code, that price may be a bargain.

Reply

Answer 2

VikingOSX

Level 10

123,006 points

Sep 29, 2025 1:47 PM in response to ofranyut

You are on your own with Python3 and third-party Python packages. This is out of scope for the Apple Support Communities and you may want to post in Apple's Developer Forums.

Reply

Answer 3

VikingOSX

Level 10

123,006 points

Sep 29, 2025 1:50 PM in response to ofranyut

[redacted - duplicate]

Reply

Answer 4

Old Toad

Level 10

215,094 points

Sep 29, 2025 3:20 PM in response to ofranyut

Give VueScan a try. It has very good OCR capabilities.

Reply