I wanted to experiment with GPT-4 Vision, reading an image and converting it to JSON for use with AI.
Here’s a concise summary of the project:
Project: OCR-AI-JSON
Goal: Create a system that extracts structured data from receipt images using GPT-4 Vision and converts it to a standardized JSON format using Docling (https://github.com/DS4SD/docling-core). Docling lets you quickly structure and organize data within a document, maximizing the efficiency of AI models in understanding and extracting valuable information.
Key Accomplishments:
Core Implementation
- Successful integration with GPT-4 Vision API
- Reliable text extraction from receipt images
- Conversion of unstructured text to structured JSON
- Implementation of both Docling and custom parsing solutions
Data Validation & Quality
- Built a comprehensive validation system for calculations
- Added metrics and analytics for receipt data
- Implemented error detection and reporting
- Established data quality guardrails
Architecture & Features
- Modular design with clear separation of concerns
- Robust error handling and logging
- Support for multiple output formats (text, markdown, JSON)
- Flexible parsing system with fallback options
Best Practices
- Secure API key management
- Comprehensive documentation
- Clean code structure
- Validation and error reporting
Future Potential:
1. Production Deployment
- Containerization for deployment
- API endpoint creation
- Scalability considerations
2. Monitoring & Improvement
- Performance metrics tracking
- Error rate monitoring
- User feedback integration
- Continuous model improvement
3. Documentation & Training
- API Documentation
- Usage examples
- Integration guides
- Error handling documentation
The project successfully demonstrates the full pipeline from image to structured data, with proper validation and error handling, providing a foundation for production deployment.
# ocr-gpt4-v2.py
import base64
import requests
from pathlib import Path
from dataclasses import dataclass
from typing import Optional, Dict, Any, List, Tuple
from dotenv import load_dotenv
import os
import json
import tempfile
from datetime import datetime
from docling_core.types import DoclingDocument
import re
@dataclass
class OCRResponse:
    """Container for the outputs of a single OCR run.

    Attributes:
        text: Raw text returned by the vision model.
        markdown: Markdown-formatted version of the extracted text
            (currently identical to ``text``).
        json: Structured JSON representation of the document.
        docling_processed: True when the JSON was produced by Docling,
            False when the custom fallback parser was used.
    """

    text: str
    markdown: str
    json: Dict[str, Any]
    docling_processed: bool = False

    @property
    def structured_data(self) -> Dict[str, Any]:
        """Return the ``structured_data`` section of the JSON payload ({} if absent)."""
        return self.json.get("structured_data", {})

    @property
    def receipt_total(self) -> Optional[float]:
        """Return the receipt's declared total value, or None when unavailable."""
        receipt_data = self.structured_data.get("receipt", {})
        total_data = receipt_data.get("total", {})
        return total_data.get("value") if total_data else None
class LlamaOCR:
    """Extract receipt text from images via the OpenAI vision API and
    convert it to structured JSON.

    Conversion is attempted with Docling first; a custom markdown line
    parser is used as a fallback.

    NOTE(review): the class name predates the switch to GPT-4o; it is
    kept to avoid breaking existing callers.
    """

    def __init__(self, api_key: str):
        """Store the API key and endpoint.

        Args:
            api_key: OpenAI API key.

        Raises:
            ValueError: If ``api_key`` is empty or None.
        """
        if not api_key:
            raise ValueError("API key is required")
        self.api_key = api_key
        self.api_endpoint = "https://api.openai.com/v1/chat/completions"

    def _image_to_base64(self, image_path: str) -> str:
        """Convert image file to base64 string.

        Raises:
            FileNotFoundError: If ``image_path`` does not exist.
        """
        image_path = Path(image_path)
        if not image_path.exists():
            raise FileNotFoundError(f"Image file not found: {image_path}")
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode('utf-8')

    def _clean_markdown(self, content: str) -> str:
        """Clean markdown content by removing code-fence markers."""
        content = content.replace('```markdown', '').replace('```', '')
        return content.strip()

    def _parse_amount(self, text: str) -> Optional[float]:
        """Extract the first dollar amount (e.g. ``$12.50``) from *text*."""
        match = re.search(r'\$(\d+\.?\d*)', text)
        if match:
            return float(match.group(1))
        return None

    def _parse_item_line(self, line: str) -> Optional[Tuple[str, str, float]]:
        """Parse a line containing an item description and a price.

        Returns:
            ``(item, amount_str, amount)`` when the last column of the
            line contains a dollar amount, otherwise ``None``.
        """
        # Columns are separated by runs of two or more spaces, or tabs.
        parts = re.split(r'\s{2,}|\t+', line)
        if len(parts) >= 2:
            item = parts[0].strip()
            amount_str = parts[-1].strip()
            amount = self._parse_amount(amount_str)
            if amount is not None:
                return item, amount_str, amount
        return None

    def _validate_receipt_calculations(self, receipt_data: Dict[str, Any]) -> Dict[str, Any]:
        """Validate receipt calculations and return validation metadata.

        Errors mark the receipt invalid; warnings do not.
        """
        validation_results = {
            "is_valid": True,
            "errors": [],
            "warnings": []
        }
        items = receipt_data.get("items", [])
        total_data = receipt_data.get("total")
        subtotal_data = receipt_data.get("subtotal")
        tax_data = receipt_data.get("tax")
        # Calculate sum of items
        items_total = sum(item.get("value", 0) for item in items)
        # FIX: the declared total includes tax, so the expected total is
        # items + tax when a tax line is present. The previous version
        # compared the total against the bare item sum, which flagged
        # every taxed receipt as invalid.
        tax_value = tax_data.get("value", 0) if tax_data else 0
        expected_total = items_total + tax_value
        # Validate total
        if total_data and "value" in total_data:
            declared_total = total_data["value"]
            if abs(declared_total - expected_total) > 0.01:  # Allow for small rounding differences
                validation_results["is_valid"] = False
                validation_results["errors"].append({
                    "type": "total_mismatch",
                    "message": f"Total amount ({declared_total}) doesn't match expected total ({expected_total})",
                    "expected": expected_total,
                    "found": declared_total
                })
        # Validate subtotal if present
        if subtotal_data and "value" in subtotal_data:
            if abs(subtotal_data["value"] - items_total) > 0.01:
                validation_results["warnings"].append({
                    "type": "subtotal_mismatch",
                    "message": f"Subtotal ({subtotal_data['value']}) doesn't match sum of items ({items_total})"
                })
        # Validate tax calculations if both subtotal and total are present
        if subtotal_data and total_data and tax_data:
            expected_tax = total_data["value"] - subtotal_data["value"]
            if abs(expected_tax - tax_data["value"]) > 0.01:
                validation_results["warnings"].append({
                    "type": "tax_mismatch",
                    "message": f"Tax amount ({tax_data['value']}) doesn't match difference between total and subtotal ({expected_tax})"
                })
        return validation_results

    def _calculate_receipt_metrics(self, receipt_data: Dict[str, Any]) -> Dict[str, Any]:
        """Calculate summary metrics (count, average, extremes) for the items."""
        items = receipt_data.get("items", [])
        metrics = {
            "item_count": len(items),
            "average_item_price": 0,
            "highest_item": None,
            "lowest_item": None,
            "total_before_tax": sum(item.get("value", 0) for item in items)
        }
        if items:
            # Find highest and lowest priced items
            sorted_items = sorted(items, key=lambda x: x.get("value", 0))
            metrics["lowest_item"] = sorted_items[0]
            metrics["highest_item"] = sorted_items[-1]
            # len(items) > 0 is guaranteed here, so the division is safe.
            metrics["average_item_price"] = metrics["total_before_tax"] / metrics["item_count"]
        return metrics

    def _enhance_receipt_data(self, receipt_data: Dict[str, Any]) -> Dict[str, Any]:
        """Return a copy of *receipt_data* enriched with validation and metrics."""
        validation = self._validate_receipt_calculations(receipt_data)
        metrics = self._calculate_receipt_metrics(receipt_data)
        enhanced_data = receipt_data.copy()
        enhanced_data.update({
            "validation": validation,
            "metrics": metrics,
            "processing_info": {
                "timestamp": datetime.now().isoformat(),
                "version": "1.0"
            }
        })
        return enhanced_data

    def _try_docling_conversion(self, markdown_content: str) -> Tuple[Optional[Dict[str, Any]], bool]:
        """Attempt to convert markdown using Docling.

        Returns:
            ``(json_dict, True)`` on success, ``(None, False)`` on failure.
        """
        # Docling reads from a file, so stage the markdown in a temp file.
        with tempfile.NamedTemporaryFile(mode='w', suffix='.md', delete=False) as temp_file:
            temp_file.write(markdown_content)
            temp_path = temp_file.name
        try:
            print("\nAttempting Docling conversion...")
            # NOTE(review): assumes docling_core exposes
            # DoclingDocument.from_markdown — verify against the installed
            # docling-core version.
            doc = DoclingDocument.from_markdown(temp_path)
            docling_json = doc.to_dict()
            print("✓ Docling conversion successful")
            return docling_json, True
        except Exception as e:
            # Best-effort: any Docling failure falls through to the custom
            # parser, so swallowing the exception here is deliberate.
            print(f"✗ Docling conversion failed: {e}")
            return None, False
        finally:
            Path(temp_path).unlink()

    def _fallback_conversion(self, markdown_content: str) -> Dict[str, Any]:
        """Enhanced fallback conversion using the custom line parser."""
        print("\nUsing enhanced fallback markdown parser...")
        content = self._clean_markdown(markdown_content)
        lines = [line.strip() for line in content.split('\n') if line.strip()]
        result = {
            "type": "document",
            "content": [],
            "metadata": {
                "source": "ocr",
                "format": "markdown",
                "processed_at": datetime.now().isoformat(),
                "processor": "custom_fallback"
            },
            "structured_data": {
                "receipt": {
                    "title": None,
                    "items": [],
                    "total": None,
                    "subtotal": None,
                    "tax": None,
                    "currency": "USD"
                }
            }
        }
        receipt = result["structured_data"]["receipt"]
        current_section = "header"
        for line in lines:
            # Skip horizontal-rule / divider lines made only of dashes.
            if all(c in '-' for c in line if c.strip()):
                continue
            # First non-price, non-table line is treated as the title.
            if current_section == "header" and not any(c in line for c in ['$', '|']):
                receipt["title"] = line
                result["content"].append({
                    "type": "header",
                    "content": line,
                    "level": 1
                })
                current_section = "items"
                continue
            parsed = self._parse_item_line(line)
            if parsed:
                item, amount_str, amount = parsed
                # FIX: check 'subtotal' before 'total' — "subtotal"
                # contains "total", so the previous ordering routed
                # subtotal lines into the total slot.
                if 'subtotal' in item.lower():
                    receipt["subtotal"] = {
                        "label": item.strip(),
                        "amount": amount_str,
                        "value": amount
                    }
                elif 'total' in item.lower():
                    receipt["total"] = {
                        "label": item.strip(),
                        "amount": amount_str,
                        "value": amount
                    }
                elif 'tax' in item.lower():
                    receipt["tax"] = {
                        "label": item.strip(),
                        "amount": amount_str,
                        "value": amount
                    }
                else:
                    item_data = {
                        "item": item.strip(),
                        "amount": amount_str,
                        "value": amount
                    }
                    receipt["items"].append(item_data)
                    result["content"].append({
                        "type": "line_item",
                        "item": item.strip(),
                        "amount": amount_str,
                        "value": amount
                    })
        # Add validation and metrics
        result["structured_data"]["receipt"] = self._enhance_receipt_data(receipt)
        return result

    def process_image(self, image_path: str) -> "OCRResponse":
        """Process an image and extract text using GPT-4 Vision.

        Args:
            image_path: Path to the image file to OCR.

        Returns:
            OCRResponse holding the raw text, markdown, and structured JSON.

        Raises:
            FileNotFoundError: If the image does not exist.
            requests.exceptions.HTTPError: On a non-200 API response.
        """
        try:
            base64_image = self._image_to_base64(image_path)
            payload = {
                "model": "gpt-4o",  # Using the new recommended model
                "messages": [
                    {
                        "role": "system",
                        "content": "You are a helpful assistant that extracts text from images and formats it properly in markdown."
                    },
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "text",
                                "text": "Please extract all text from this image and format it in markdown, preserving any structure like tables, lists, and headers. Return only the extracted text without any explanations."
                            },
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:image/jpeg;base64,{base64_image}",
                                    "detail": "high"
                                }
                            }
                        ]
                    }
                ],
                "max_tokens": 1000,
                "temperature": 0
            }
            headers = {
                "Authorization": f"Bearer {self.api_key}",
                "Content-Type": "application/json"
            }
            print("Sending request to OpenAI API...")
            print(f"Using model: {payload['model']}")
            # FIX: a timeout prevents the request from hanging forever
            # when the API is unreachable.
            response = requests.post(
                self.api_endpoint,
                json=payload,
                headers=headers,
                timeout=120
            )
            print(f"Response status: {response.status_code}")
            if response.status_code != 200:
                print(f"Error Response: {response.text}")
                response.raise_for_status()
            response_data = response.json()
            content = response_data["choices"][0]["message"]["content"]
            json_content, used_docling = self._try_docling_conversion(content)
            if not json_content:
                json_content = self._fallback_conversion(content)
            return OCRResponse(
                text=content,
                markdown=content,
                json=json_content,
                docling_processed=used_docling
            )
        except Exception as e:
            print(f"Error: {str(e)}")
            raise
def save_outputs(result: "OCRResponse", base_dir: str = "output") -> None:
    """Save OCR outputs (text, markdown, JSON) to *base_dir* and print a summary.

    Args:
        result: The OCR result to persist.
        base_dir: Output directory; created (including parents) if missing.

    Raises:
        OSError: Re-raised after logging if any file cannot be written.
    """
    try:
        output_dir = Path(base_dir)
        # FIX: parents=True lets callers pass a nested path such as
        # "runs/2024/output" (previously mkdir failed on missing parents).
        output_dir.mkdir(parents=True, exist_ok=True)
        # Add processing info to JSON
        if isinstance(result.json, dict):
            result.json["metadata"] = {
                **result.json.get("metadata", {}),
                "processed_at": datetime.now().isoformat(),
                "processed_by": "docling" if result.docling_processed else "custom_parser"
            }
        # Save text output
        with open(output_dir / "output.txt", "w", encoding="utf-8") as f:
            f.write(result.text)
        # Save markdown output
        with open(output_dir / "output.md", "w", encoding="utf-8") as f:
            f.write(result.markdown)
        # Save JSON output
        with open(output_dir / "output.json", "w", encoding="utf-8") as f:
            json.dump(result.json, f, indent=2, ensure_ascii=False)
        # Print summary
        print(f"\nOutputs saved to {output_dir} directory")
        print(f"Processing method: {'Docling' if result.docling_processed else 'Custom parser'}")
        # Print receipt summary if available
        receipt_data = result.structured_data.get("receipt", {})
        if receipt_data:
            print("\nReceipt Summary:")
            print(f"Title: {receipt_data.get('title')}")
            validation = receipt_data.get("validation", {})
            if validation:
                print("\nValidation Results:")
                print(f"Valid: {'✓' if validation.get('is_valid', True) else '✗'}")
                if validation.get("errors"):
                    print("\nErrors:")
                    for error in validation["errors"]:
                        print(f"  ✗ {error['message']}")
                if validation.get("warnings"):
                    print("\nWarnings:")
                    for warning in validation["warnings"]:
                        print(f"  ⚠ {warning['message']}")
            metrics = receipt_data.get("metrics", {})
            if metrics:
                print("\nMetrics:")
                print(f"Item Count: {metrics['item_count']}")
                print(f"Average Price: ${metrics['average_item_price']:.2f}")
                if metrics.get('highest_item'):
                    print(f"Highest Item: {metrics['highest_item']['item']} (${metrics['highest_item']['value']:.2f})")
                if metrics.get('lowest_item'):
                    print(f"Lowest Item: {metrics['lowest_item']['item']} (${metrics['lowest_item']['value']:.2f})")
            print("\nItems:")
            for item in receipt_data.get('items', []):
                print(f"  - {item['item']}: {item['amount']}")
            if receipt_data.get('total'):
                print(f"\nTotal: {receipt_data['total']['amount']}")
    except Exception as e:
        print(f"Error saving outputs: {e}")
        raise
def main():
    """Run the OCR pipeline end-to-end: load config, process a sample
    image, print results, and save outputs.

    Exits early (without raising) when OPENAI_API_KEY is not configured.
    """
    # FIX: resolve the .env file relative to this script instead of a
    # hard-coded absolute user path, so the script runs on any machine.
    project_root = Path(__file__).resolve().parent
    env_path = project_root / '.env'
    print(f"Looking for .env at: {env_path}")
    print(f".env exists: {env_path.exists()}")
    # Load environment variables from the specific path
    load_dotenv(dotenv_path=env_path)
    # Get API key
    api_key = os.getenv('OPENAI_API_KEY')
    if not api_key:
        print("Error: OPENAI_API_KEY not found in environment variables")
        print("Please set your OpenAI API key using one of these methods:")
        print("1. Export it in your terminal: export OPENAI_API_KEY='your-key'")
        print("2. Add it to a .env file: OPENAI_API_KEY=your-key")
        return
    try:
        # Initialize OCR
        ocr = LlamaOCR(api_key=api_key)
        # Process image
        result = ocr.process_image("sample_images/sample_receipt.jpg")
        # Print results
        print("\nExtracted Text:")
        print(result.text)
        print("\nMarkdown Format:")
        print(result.markdown)
        print("\nJSON Format:")
        print(json.dumps(result.json, indent=2))
        # Save outputs
        save_outputs(result)
    except FileNotFoundError as e:
        print(f"File not found error: {e}")
    except requests.exceptions.RequestException as e:
        print(f"API request error: {e}")
    except json.JSONDecodeError as e:
        print(f"JSON parsing error: {e}")
    except Exception as e:
        print(f"Unexpected error: {e}")


if __name__ == "__main__":
    main()