I wanted to experiment with GPT-4 Vision, reading an image and converting it to JSON for use with AI.

Michael Wahl
6 min read · Nov 15, 2024

--

Here’s a concise summary of the project:

Project: OCR-AI-JSON

Goal: Create a system that extracts structured data from receipt images using GPT-4 Vision and converts it to a standardized JSON format using Docling (https://github.com/DS4SD/docling-core). With Docling, you can quickly structure and organize the data within a document to maximize the efficiency of AI models in understanding and extracting valuable information.

Key Accomplishments:

Core Implementation

  • Successful integration with GPT-4 Vision API
  • Reliable text extraction from receipt images
  • Conversion of unstructured text to structured JSON
  • Implementation of both Docling and custom parsing solutions

Data Validation & Quality

  • Built a comprehensive validation system for calculations
  • Added metrics and analytics for receipt data
  • Implemented error detection and reporting
  • Established data quality guardrails

Architecture & Features

  • Modular design with clear separation of concerns
  • Robust error handling and logging
  • Support for multiple output formats (text, markdown, JSON)
  • Flexible parsing system with fallback options

Best Practices

  • Secure API key management
  • Comprehensive documentation
  • Clean code structure
  • Validation and error reporting

Future Potential:

1. Production Deployment

  • Containerization for deployment
  • API endpoint creation
  • Scalability considerations

2. Monitoring & Improvement

  • Performance metrics tracking
  • Error rate monitoring
  • User feedback integration
  • Continuous model improvement

3. Documentation & Training

  • API Documentation
  • Usage examples
  • Integration guides
  • Error handling documentation

The project successfully demonstrates the full pipeline from image to structured data, with proper validation and error handling, providing a foundation for production deployment.


# ocr-gpt4-v2.py
import base64
import requests
from pathlib import Path
from dataclasses import dataclass
from typing import Optional, Dict, Any, List, Tuple
from dotenv import load_dotenv
import os
import json
import tempfile
from datetime import datetime
from docling_core.types import DoclingDocument
import re

@dataclass
class OCRResponse:
    """Container for the outputs of one OCR run.

    Holds the raw model text, the same content treated as markdown, and the
    structured JSON produced by Docling or the custom fallback parser.
    """

    text: str                        # raw text returned by the vision model
    markdown: str                    # identical content, interpreted as markdown
    json: Dict[str, Any]             # structured JSON document
    docling_processed: bool = False  # True when Docling (not the fallback) built the JSON

    @property
    def structured_data(self) -> Dict[str, Any]:
        """Return the parsed "structured_data" section, or {} when absent."""
        return self.json.get("structured_data", {})

    @property
    def receipt_total(self) -> Optional[float]:
        """Return the receipt's total value, or None when not extracted."""
        total = self.structured_data.get("receipt", {}).get("total", {})
        return total.get("value") if total else None

class LlamaOCR:
    """OCR client: sends an image to the OpenAI vision API, receives markdown
    text, and converts it to structured JSON via Docling with a custom
    fallback parser.

    NOTE(review): despite the name, this class calls OpenAI's GPT-4 vision
    models, not a Llama model; the name is kept so existing callers work.
    """

    def __init__(self, api_key: str):
        """Store the OpenAI API key and chat-completions endpoint.

        Raises:
            ValueError: if api_key is empty or None.
        """
        if not api_key:
            raise ValueError("API key is required")
        self.api_key = api_key
        self.api_endpoint = "https://api.openai.com/v1/chat/completions"

    def _image_to_base64(self, image_path: str) -> str:
        """Read an image file and return its contents base64-encoded.

        Raises:
            FileNotFoundError: if the path does not exist.
        """
        image_path = Path(image_path)
        if not image_path.exists():
            raise FileNotFoundError(f"Image file not found: {image_path}")

        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode('utf-8')

    def _clean_markdown(self, content: str) -> str:
        """Strip code-fence markers (```markdown / ```) and outer whitespace."""
        content = content.replace('```markdown', '').replace('```', '')
        return content.strip()

    def _parse_amount(self, text: str) -> Optional[float]:
        """Extract a dollar amount from text; None when no price is present.

        Accepts thousands separators ("$1,234.56"); the original pattern
        stopped at the first comma and returned a truncated value.
        """
        match = re.search(r'\$\s*(\d[\d,]*\.?\d*)', text)
        if match:
            return float(match.group(1).replace(',', ''))
        return None

    def _parse_item_line(self, line: str) -> Optional[Tuple[str, str, float]]:
        """Parse a "description <gap> $price" table line.

        Returns:
            (item, amount_str, amount) or None when no price is found.
        """
        # Columns are separated by runs of 2+ spaces or tabs (table layout).
        parts = re.split(r'\s{2,}|\t+', line)
        if len(parts) >= 2:
            item = parts[0].strip()
            amount_str = parts[-1].strip()
            amount = self._parse_amount(amount_str)
            if amount is not None:
                return item, amount_str, amount
        return None

    def _validate_receipt_calculations(self, receipt_data: Dict[str, Any]) -> Dict[str, Any]:
        """Validate receipt arithmetic and return a validation report.

        Returns:
            Dict with "is_valid" (bool), "errors" (list), "warnings" (list).
        """
        validation_results: Dict[str, Any] = {
            "is_valid": True,
            "errors": [],
            "warnings": []
        }

        items = receipt_data.get("items", [])
        total_data = receipt_data.get("total")
        subtotal_data = receipt_data.get("subtotal")
        tax_data = receipt_data.get("tax")

        # Calculate sum of items
        items_total = sum(item.get("value", 0) for item in items)
        tax_value = tax_data.get("value") if tax_data else None

        # Validate total. The declared total on a receipt includes tax (this
        # module's own tax check below computes tax as total - subtotal), so
        # compare against items + tax; the original compared against the item
        # sum alone and flagged every taxed receipt as invalid.
        if total_data and "value" in total_data:
            declared_total = total_data["value"]
            expected_total = items_total + (tax_value or 0)
            if abs(declared_total - expected_total) > 0.01:  # allow rounding
                validation_results["is_valid"] = False
                validation_results["errors"].append({
                    "type": "total_mismatch",
                    "message": f"Total amount ({declared_total}) doesn't match expected total ({expected_total})",
                    "expected": expected_total,
                    "found": declared_total
                })

        # Validate subtotal if present
        if subtotal_data and "value" in subtotal_data:
            if abs(subtotal_data["value"] - items_total) > 0.01:
                validation_results["warnings"].append({
                    "type": "subtotal_mismatch",
                    "message": f"Subtotal ({subtotal_data['value']}) doesn't match sum of items ({items_total})"
                })

        # Cross-check tax when subtotal, total, and tax all carry values.
        # Guard every "value" key: the original indexed them directly and
        # could raise KeyError on a partially-parsed receipt.
        if (subtotal_data and "value" in subtotal_data
                and total_data and "value" in total_data
                and tax_data and "value" in tax_data):
            expected_tax = total_data["value"] - subtotal_data["value"]
            if abs(expected_tax - tax_data["value"]) > 0.01:
                validation_results["warnings"].append({
                    "type": "tax_mismatch",
                    "message": f"Tax amount ({tax_data['value']}) doesn't match difference between total and subtotal ({expected_tax})"
                })

        return validation_results

    def _calculate_receipt_metrics(self, receipt_data: Dict[str, Any]) -> Dict[str, Any]:
        """Compute summary metrics (count, average, extremes) for the items."""
        items = receipt_data.get("items", [])

        metrics: Dict[str, Any] = {
            "item_count": len(items),
            "average_item_price": 0,
            "highest_item": None,
            "lowest_item": None,
            "total_before_tax": sum(item.get("value", 0) for item in items)
        }

        if items:
            # Find highest and lowest priced items
            sorted_items = sorted(items, key=lambda x: x.get("value", 0))
            metrics["lowest_item"] = sorted_items[0]
            metrics["highest_item"] = sorted_items[-1]

            # item_count is necessarily > 0 inside this branch
            metrics["average_item_price"] = metrics["total_before_tax"] / metrics["item_count"]

        return metrics

    def _enhance_receipt_data(self, receipt_data: Dict[str, Any]) -> Dict[str, Any]:
        """Return a copy of receipt_data with validation, metrics, and
        processing-info sections attached."""
        validation = self._validate_receipt_calculations(receipt_data)
        metrics = self._calculate_receipt_metrics(receipt_data)

        enhanced_data = receipt_data.copy()
        enhanced_data.update({
            "validation": validation,
            "metrics": metrics,
            "processing_info": {
                "timestamp": datetime.now().isoformat(),
                "version": "1.0"
            }
        })

        return enhanced_data

    def _try_docling_conversion(self, markdown_content: str) -> tuple[Optional[Dict[str, Any]], bool]:
        """Attempt to convert markdown to JSON via Docling.

        Returns:
            (json_dict, True) on success, (None, False) on any failure —
            callers then fall back to the custom parser.
        """
        # Docling reads from a file, so stage the markdown in a temp file;
        # delete=False because Docling opens it after this context closes.
        with tempfile.NamedTemporaryFile(mode='w', suffix='.md', delete=False) as temp_file:
            temp_file.write(markdown_content)
            temp_path = temp_file.name

        try:
            print("\nAttempting Docling conversion...")
            doc = DoclingDocument.from_markdown(temp_path)
            docling_json = doc.to_dict()
            print("✓ Docling conversion successful")
            return docling_json, True
        except Exception as e:
            # Broad catch is deliberate: any Docling failure routes to the
            # fallback parser rather than aborting the pipeline.
            print(f"✗ Docling conversion failed: {e}")
            return None, False
        finally:
            Path(temp_path).unlink()

    def _fallback_conversion(self, markdown_content: str) -> Dict[str, Any]:
        """Convert receipt markdown to JSON with the custom line parser.

        Treats the first non-price line as the receipt title, classifies
        priced lines as total/subtotal/tax/item, then attaches validation
        and metrics via _enhance_receipt_data.
        """
        print("\nUsing enhanced fallback markdown parser...")
        content = self._clean_markdown(markdown_content)
        lines = [line.strip() for line in content.split('\n') if line.strip()]

        result: Dict[str, Any] = {
            "type": "document",
            "content": [],
            "metadata": {
                "source": "ocr",
                "format": "markdown",
                "processed_at": datetime.now().isoformat(),
                "processor": "custom_fallback"
            },
            "structured_data": {
                "receipt": {
                    "title": None,
                    "items": [],
                    "total": None,
                    "subtotal": None,
                    "tax": None,
                    "currency": "USD"  # assumes USD; the regex only matches "$"
                }
            }
        }

        current_section = "header"
        for line in lines:
            # Skip horizontal-rule / divider lines made only of dashes.
            if all(c in '-' for c in line if c.strip()):
                continue

            # First line without a price or table marker becomes the title.
            if current_section == "header" and not any(c in line for c in ['$', '|']):
                result["structured_data"]["receipt"]["title"] = line
                result["content"].append({
                    "type": "header",
                    "content": line,
                    "level": 1
                })
                current_section = "items"
                continue

            parsed = self._parse_item_line(line)
            if parsed:
                item, amount_str, amount = parsed

                # Classify by label keyword; "subtotal" is checked after
                # "total" here, so labels containing "total" win first —
                # matching the original branch order.
                if 'total' in item.lower() and 'subtotal' not in item.lower():
                    result["structured_data"]["receipt"]["total"] = {
                        "label": item.strip(),
                        "amount": amount_str,
                        "value": amount
                    }
                elif 'subtotal' in item.lower():
                    result["structured_data"]["receipt"]["subtotal"] = {
                        "label": item.strip(),
                        "amount": amount_str,
                        "value": amount
                    }
                elif 'tax' in item.lower():
                    result["structured_data"]["receipt"]["tax"] = {
                        "label": item.strip(),
                        "amount": amount_str,
                        "value": amount
                    }
                else:
                    item_data = {
                        "item": item.strip(),
                        "amount": amount_str,
                        "value": amount
                    }
                    result["structured_data"]["receipt"]["items"].append(item_data)
                    result["content"].append({
                        "type": "line_item",
                        "item": item.strip(),
                        "amount": amount_str,
                        "value": amount
                    })

        # Add validation and metrics
        if "receipt" in result["structured_data"]:
            result["structured_data"]["receipt"] = self._enhance_receipt_data(
                result["structured_data"]["receipt"]
            )

        return result

    def process_image(self, image_path: str) -> OCRResponse:
        """OCR an image with GPT-4 Vision and return structured results.

        Args:
            image_path: path to the image file (sent as JPEG data URL).

        Returns:
            OCRResponse with raw text, markdown, and JSON.

        Raises:
            FileNotFoundError: if the image does not exist.
            requests.exceptions.RequestException: on HTTP/API failure.
        """
        try:
            base64_image = self._image_to_base64(image_path)

            payload = {
                "model": "gpt-4o",  # current recommended vision-capable model
                "messages": [
                    {
                        "role": "system",
                        "content": "You are a helpful assistant that extracts text from images and formats it properly in markdown."
                    },
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "text",
                                "text": "Please extract all text from this image and format it in markdown, preserving any structure like tables, lists, and headers. Return only the extracted text without any explanations."
                            },
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:image/jpeg;base64,{base64_image}",
                                    "detail": "high"
                                }
                            }
                        ]
                    }
                ],
                "max_tokens": 1000,
                "temperature": 0  # deterministic extraction
            }

            headers = {
                "Authorization": f"Bearer {self.api_key}",
                "Content-Type": "application/json"
            }

            print("Sending request to OpenAI API...")
            print(f"Using model: {payload['model']}")

            # timeout prevents the original's indefinite hang when the API
            # stalls; 120s covers large high-detail images.
            response = requests.post(
                self.api_endpoint,
                json=payload,
                headers=headers,
                timeout=120
            )

            print(f"Response status: {response.status_code}")

            if response.status_code != 200:
                print(f"Error Response: {response.text}")
                response.raise_for_status()

            response_data = response.json()
            content = response_data["choices"][0]["message"]["content"]

            # Prefer Docling; fall back to the custom parser on failure.
            json_content, used_docling = self._try_docling_conversion(content)
            if not json_content:
                json_content = self._fallback_conversion(content)

            return OCRResponse(
                text=content,
                markdown=content,
                json=json_content,
                docling_processed=used_docling
            )

        except Exception as e:
            # Log then re-raise so callers can apply their own handling.
            print(f"Error: {str(e)}")
            raise
def save_outputs(result: OCRResponse, base_dir: str = "output") -> None:
    """Write OCR results to text, markdown, and JSON files and print a summary.

    Note: mutates ``result.json`` in place by stamping a ``metadata`` section
    with processing provenance before writing.

    Args:
        result: OCR pipeline output (any object with .text, .markdown, .json,
            .docling_processed, and .structured_data attributes works).
        base_dir: output directory; created (including parents) if missing.

    Raises:
        Re-raises any I/O or serialization error after logging it.
    """
    try:
        output_dir = Path(base_dir)
        # parents=True so nested paths like "runs/2024/out" work; the
        # original mkdir(exist_ok=True) raised when a parent was missing.
        output_dir.mkdir(parents=True, exist_ok=True)

        # Stamp provenance metadata into the JSON before writing it out.
        if isinstance(result.json, dict):
            result.json["metadata"] = {
                **result.json.get("metadata", {}),
                "processed_at": datetime.now().isoformat(),
                "processed_by": "docling" if result.docling_processed else "custom_parser"
            }

        # Save text and markdown outputs.
        (output_dir / "output.txt").write_text(result.text, encoding="utf-8")
        (output_dir / "output.md").write_text(result.markdown, encoding="utf-8")

        # Save JSON output
        with open(output_dir / "output.json", "w", encoding="utf-8") as f:
            json.dump(result.json, f, indent=2, ensure_ascii=False)

        # Print summary
        print(f"\nOutputs saved to {output_dir} directory")
        print(f"Processing method: {'Docling' if result.docling_processed else 'Custom parser'}")

        # Print receipt summary if available. NOTE(review): the item/metric
        # key names below match the custom fallback parser's schema; the
        # Docling path may use a different shape — confirm before relying on
        # this summary for Docling output.
        receipt_data = result.structured_data.get("receipt", {})
        if receipt_data:
            print("\nReceipt Summary:")
            print(f"Title: {receipt_data.get('title')}")

            validation = receipt_data.get("validation", {})
            if validation:
                print("\nValidation Results:")
                print(f"Valid: {'✓' if validation.get('is_valid', True) else '✗'}")

                if validation.get("errors"):
                    print("\nErrors:")
                    for error in validation["errors"]:
                        print(f"  ✗ {error['message']}")

                if validation.get("warnings"):
                    print("\nWarnings:")
                    for warning in validation["warnings"]:
                        print(f"  ⚠ {warning['message']}")

            metrics = receipt_data.get("metrics", {})
            if metrics:
                print("\nMetrics:")
                print(f"Item Count: {metrics['item_count']}")
                print(f"Average Price: ${metrics['average_item_price']:.2f}")
                if metrics.get('highest_item'):
                    print(f"Highest Item: {metrics['highest_item']['item']} (${metrics['highest_item']['value']:.2f})")
                if metrics.get('lowest_item'):
                    print(f"Lowest Item: {metrics['lowest_item']['item']} (${metrics['lowest_item']['value']:.2f})")

            print("\nItems:")
            for item in receipt_data.get('items', []):
                print(f"  - {item['item']}: {item['amount']}")

            if receipt_data.get('total'):
                print(f"\nTotal: {receipt_data['total']['amount']}")

    except Exception as e:
        # Log then re-raise so the caller's error handling still triggers.
        print(f"Error saving outputs: {e}")
        raise

def main():
    """Entry point: load the API key, OCR the sample receipt, save results.

    Exits early (with instructions) when OPENAI_API_KEY is not configured;
    known failure modes are caught and reported rather than raised.
    """
    # Resolve the .env file relative to this script instead of the original
    # hard-coded absolute path to a personal OneDrive folder, which only
    # worked on the author's machine.
    project_root = Path(__file__).resolve().parent
    env_path = project_root / '.env'

    print(f"Looking for .env at: {env_path}")
    print(f".env exists: {env_path.exists()}")

    # Load environment variables from the specific path
    load_dotenv(dotenv_path=env_path)

    # Get API key
    api_key = os.getenv('OPENAI_API_KEY')
    if not api_key:
        print("Error: OPENAI_API_KEY not found in environment variables")
        print("Please set your OpenAI API key using one of these methods:")
        print("1. Export it in your terminal: export OPENAI_API_KEY='your-key'")
        print("2. Add it to a .env file: OPENAI_API_KEY=your-key")
        return

    try:
        # Initialize OCR client (name is historical; it calls the OpenAI API).
        ocr = LlamaOCR(api_key=api_key)

        # Process the bundled sample image.
        result = ocr.process_image("sample_images/sample_receipt.jpg")

        # Print results
        print("\nExtracted Text:")
        print(result.text)
        print("\nMarkdown Format:")
        print(result.markdown)
        print("\nJSON Format:")
        print(json.dumps(result.json, indent=2))

        # Save outputs
        save_outputs(result)

    except FileNotFoundError as e:
        print(f"File not found error: {e}")
    except requests.exceptions.RequestException as e:
        print(f"API request error: {e}")
    except json.JSONDecodeError as e:
        print(f"JSON parsing error: {e}")
    except Exception as e:
        print(f"Unexpected error: {e}")

if __name__ == "__main__":
    main()

--

--

Michael Wahl
Michael Wahl

Written by Michael Wahl

Husband | Dad | VP of IT | MBA | Author | AI | #AWSCommunityBuilder | Opinions expressed here are my own | https://cv.michaelwahl.org

No responses yet