from flask import Flask, request, jsonify
from PIL import Image
import pytesseract
import re

# Flask application object; the route handlers in this file register on it
# through the @app.route decorator.
app = Flask(__name__)

# A line consisting only of characters that are neither word characters
# (``\w``) nor Arabic letters in U+0621..U+064A, i.e. pure symbol noise.
_SYMBOLS_ONLY_RE = re.compile(r'^[^\w\d\u0621-\u064A]+$')
# Any single character that is not a word character, an Arabic letter in
# U+0621..U+064A, or whitespace.
_UNWANTED_CHAR_RE = re.compile(r'[^\w\d\u0621-\u064A\s]')


def clean_and_format_text(raw_text):
    """Split raw text into lines and strip symbol noise from each one.

    Blank lines and lines made up only of symbols are dropped. In the
    remaining lines every character that is not a word character, an
    Arabic letter or whitespace is removed.

    Args:
        raw_text: The text to clean, as a single string.

    Returns:
        A list of cleaned, non-empty lines in their original order, each
        without leading or trailing whitespace.
    """
    cleaned_lines = []
    for line in raw_text.splitlines():
        line = line.strip()
        # Skip lines that are empty or contain nothing but useless symbols.
        if not line or _SYMBOLS_ONLY_RE.match(line):
            continue
        # Removing a leading/trailing symbol exposes the whitespace that sat
        # next to it ("- name" -> " name"), so strip once more afterwards.
        cleaned_lines.append(_UNWANTED_CHAR_RE.sub('', line).strip())

    return cleaned_lines

@app.route('/extract-id', methods=['POST'])
def extract_id_info():
    """Run OCR on an uploaded image and return the cleaned text lines.

    Expects a POST request carrying the file under the ``image`` form field.
    OCR is run with the ``ara+eng`` language setting.

    Returns:
        On success, a JSON body ``{"lines": [...]}``. On failure, a JSON body
        ``{"error": "..."}`` with status 400 (no upload, unreadable image,
        no text found, or nothing usable after cleaning) or 500 (the OCR
        call itself failed).
    """
    if 'image' not in request.files:
        return jsonify({"error": "No image uploaded"}), 400

    upload = request.files['image']

    # Open the image separately from the OCR step so that a corrupt or
    # non-image upload is reported as a client error instead of escaping as
    # an unhandled exception. Pillow signals unreadable input with OSError
    # (UnidentifiedImageError is a subclass) or ValueError.
    try:
        img = Image.open(upload)
    except (OSError, ValueError):
        return jsonify({"error": "Uploaded file is not a valid image"}), 400

    try:
        # The context manager releases the image's resources once OCR is done.
        with img:
            raw_text = pytesseract.image_to_string(img, lang='ara+eng')
    except Exception as e:
        # Request boundary: turn any OCR failure into a JSON error response.
        return jsonify({"error": f"OCR failed: {str(e)}"}), 500

    if not raw_text.strip():
        return jsonify({"error": "No text found in image"}), 400

    formatted_lines = clean_and_format_text(raw_text)

    if not formatted_lines:
        return jsonify({"error": "Text could not be parsed meaningfully"}), 400

    data = {
        "lines": formatted_lines
    }

    return jsonify(data)

if __name__ == '__main__':
    import os

    # The server listens on all interfaces, and Flask's debug mode enables an
    # interactive debugger that can execute arbitrary code. Keep it off by
    # default and let developers opt in explicitly with FLASK_DEBUG=1.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").strip().lower() in {
        "1", "true", "yes", "on",
    }
    app.run(host="0.0.0.0", port=8009, debug=debug_enabled)
