helper scripts
This commit is contained in:
Executable
+169
@@ -0,0 +1,169 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import glob
import os
import sqlite3
import subprocess
import tempfile
from datetime import datetime
|
||||
|
||||
# Configuration
# Directory that is scanned for screenshot images ("~" expands to the
# current user's home directory).
SCREENSHOTS_DIR = os.path.expanduser("~/Screenshots")
# SQLite database file in which the OCR results are stored.
DATABASE_PATH = os.path.expanduser("~/screenshot_ocr.db")
|
||||
|
||||
|
||||
def create_database():
    """Ensure the SQLite database and its ocr_results table exist.

    Connects to DATABASE_PATH (SQLite creates the file when it is missing),
    issues a CREATE TABLE IF NOT EXISTS for ocr_results, commits, closes the
    connection and prints where the database lives.
    """
    # One row per OCR'd image; filename is UNIQUE so a file is stored once.
    schema_sql = """
        CREATE TABLE IF NOT EXISTS ocr_results (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            filename TEXT UNIQUE,
            full_path TEXT,
            ocr_text TEXT,
            file_size INTEGER,
            created_date TEXT,
            ocr_date TEXT
        )
    """

    db = sqlite3.connect(DATABASE_PATH)
    db.execute(schema_sql)
    db.commit()
    db.close()

    print(f"Database initialized at {DATABASE_PATH}")
|
||||
|
||||
|
||||
def get_processed_files():
    """Return the set of filenames already stored in ocr_results.

    Returns:
        A set containing the ``filename`` column of every row in the
        ocr_results table of the database at DATABASE_PATH.
    """
    db = sqlite3.connect(DATABASE_PATH)
    rows = db.execute("SELECT filename FROM ocr_results").fetchall()
    db.close()

    # Each row is a 1-tuple; unpack it to collect the bare filenames.
    return {name for (name,) in rows}
|
||||
|
||||
|
||||
def perform_ocr(image_path):
    """Perform OCR on an image file using the ``tesseract`` command.

    Tesseract is given an output base name and writes the recognised text
    to ``<base>.txt``. The base lives inside a private temporary directory
    that is removed when this function returns, on success or failure, so
    no predictable file names are created in a shared directory and nothing
    is left behind when reading the result fails.

    Args:
        image_path: Path of the image file to run OCR on.

    Returns:
        The recognised text with leading/trailing whitespace stripped, or
        an empty string when tesseract could not be started, exited with a
        non-zero status, or its output could not be read.
    """
    try:
        with tempfile.TemporaryDirectory(prefix="screenshot_ocr_") as work_dir:
            # A fixed base name avoids deriving the path from the image
            # name, which previously broke for names containing ".txt".
            output_base = os.path.join(work_dir, "ocr")

            # Run tesseract; check=True turns a non-zero exit status into
            # CalledProcessError.
            subprocess.run(
                ["tesseract", image_path, output_base],
                check=True,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
            )

            # Tesseract appends ".txt" to the output base it was given.
            with open(f"{output_base}.txt", "r", encoding="utf-8") as f:
                return f.read().strip()
    except subprocess.CalledProcessError as e:
        print(f"Error running tesseract on {image_path}: {str(e)}")
        return ""
    except Exception as e:
        # Deliberately broad: one unreadable image (or a missing tesseract
        # binary) must not abort the caller's batch; "" signals failure.
        print(f"Error processing {image_path}: {str(e)}")
        return ""
|
||||
|
||||
|
||||
def add_to_database(filename, full_path, ocr_text, file_size, created_date):
    """Insert one OCR result into the ocr_results table.

    The row's ocr_date is set to the current local time in ISO format.
    Failures are reported by printing a message rather than raising: a
    sqlite3.IntegrityError is reported as the file already existing in the
    database, any other exception as a generic insert error. The connection
    is always closed.

    Args:
        filename: Value stored in the filename column.
        full_path: Value stored in the full_path column.
        ocr_text: Value stored in the ocr_text column.
        file_size: Value stored in the file_size column.
        created_date: Value stored in the created_date column.
    """
    insert_sql = """
        INSERT INTO ocr_results
        (filename, full_path, ocr_text, file_size, created_date, ocr_date)
        VALUES (?, ?, ?, ?, ?, ?)
    """

    db = sqlite3.connect(DATABASE_PATH)
    try:
        row = (
            filename,
            full_path,
            ocr_text,
            file_size,
            created_date,
            datetime.now().isoformat(),
        )
        db.execute(insert_sql, row)
        db.commit()
        print(f"Added {filename} to database")
    except sqlite3.IntegrityError:
        print(f"File {filename} already exists in database")
    except Exception as err:
        print(f"Error adding {filename} to database: {str(err)}")
    finally:
        db.close()
|
||||
|
||||
|
||||
def main():
    """Main function to process screenshot images.

    Initialises the database, collects the ``*.png``, ``*.jpg`` and
    ``*.jpeg`` files directly inside SCREENSHOTS_DIR, and for every file
    whose name is not yet in the database runs OCR and stores the result.
    A file that cannot be stat'ed, or for which OCR yields no text, is
    counted as an error and the run continues with the next file. A
    summary of the counts is printed at the end.
    """
    print("Starting OCR process for screenshots...")

    # Create database if it doesn't exist
    create_database()

    # Get list of already processed files
    processed_files = get_processed_files()
    print(f"Found {len(processed_files)} already processed files")

    # Get list of PNG and JPG files
    image_files = []
    for pattern in ("*.png", "*.jpg", "*.jpeg"):
        image_files.extend(glob.glob(os.path.join(SCREENSHOTS_DIR, pattern)))
    print(f"Found {len(image_files)} image files")

    # Process each image file
    processed_count = 0
    skipped_count = 0
    error_count = 0

    for image_path in image_files:
        filename = os.path.basename(image_path)

        # Skip if already processed
        if filename in processed_files:
            print(f"Skipping {filename} (already processed)")
            skipped_count += 1
            continue

        print(f"Processing {filename}...")

        # Get file information. The entry may be a broken symlink or may
        # have been removed since the glob; that must not abort the batch.
        try:
            file_stats = os.stat(image_path)
        except OSError as e:
            print(f"Error reading {image_path}: {e}")
            error_count += 1
            continue

        file_size = file_stats.st_size
        # st_mtime is the last-modification time; it is what gets stored
        # in the created_date column.
        created_date = datetime.fromtimestamp(file_stats.st_mtime).isoformat()

        # Perform OCR
        ocr_text = perform_ocr(image_path)

        if ocr_text:
            # Add to database
            add_to_database(filename, image_path, ocr_text, file_size, created_date)
            processed_count += 1
        else:
            print(f"No OCR text extracted from {filename}")
            error_count += 1

    # add_to_database reports insert failures only by printing, so the
    # total is re-read from the database instead of being derived from
    # processed_count.
    total_in_database = len(get_processed_files())

    print("\nOCR process completed:")
    print(f"- Processed: {processed_count}")
    print(f"- Skipped (already in database): {skipped_count}")
    print(f"- Errors: {error_count}")
    print(f"- Total files in database: {total_in_database}")
|
||||
|
||||
|
||||
# Run the OCR pass only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user