I wanted to experiment with GPT-4 Vision, reading an image and converting it to JSON for use with AI.

Michael Wahl
6 min read · Nov 15, 2024

--

Here’s a concise summary of the project:

Project: OCR-AI-JSON

Goal: Create a system that extracts structured data from receipt images using GPT-4 Vision and converts it to a standardized JSON format using Docling (https://github.com/DS4SD/docling-core). With Docling, you can quickly structure and organize the data within a document to maximize the efficiency of AI models in understanding and extracting valuable information.

Key Accomplishments:

Core Implementation

  • Successful integration with GPT-4 Vision API
  • Reliable text extraction from receipt images
  • Conversion of unstructured text to structured JSON
  • Implementation of both Docling and custom parsing solutions

Data Validation & Quality

  • Built a comprehensive validation system for calculations
  • Added metrics and analytics for receipt data
  • Implemented error detection and reporting
  • Established data quality guardrails

Architecture & Features

  • Modular design with clear separation of concerns
  • Robust error handling and logging
  • Support for multiple output formats (text, markdown, JSON)
  • Flexible parsing system with fallback options

Best Practices

  • Secure API key management
  • Comprehensive documentation
  • Clean code structure
  • Validation and error reporting

Future Potential:

1. Production Deployment

  • Containerization for deployment
  • API endpoint creation
  • Scalability considerations

2. Monitoring & Improvement

  • Performance metrics tracking
  • Error rate monitoring
  • User feedback integration
  • Continuous model improvement

3. Documentation & Training

  • API Documentation
  • Usage examples
  • Integration guides
  • Error handling documentation

The project successfully demonstrates the full pipeline from image to structured data, with proper validation and error handling, providing a foundation for production deployment.


# ocr-gpt4-v2.py
import base64
import requests
from pathlib import Path
from dataclasses import dataclass
from typing import Optional, Dict, Any, List, Tuple
from dotenv import load_dotenv
import os
import json
import tempfile
from datetime import datetime
from docling_core.types import DoclingDocument
import re

@dataclass
class OCRResponse:
    """Container for the outputs of one OCR run.

    Holds the raw model text, the same content treated as markdown, and the
    structured JSON produced by Docling or the custom fallback parser.
    """

    text: str                        # raw text returned by the vision model
    markdown: str                    # identical content, interpreted as markdown
    json: Dict[str, Any]             # structured JSON document
    docling_processed: bool = False  # True when Docling (not the fallback) built the JSON

    @property
    def structured_data(self) -> Dict[str, Any]:
        """Return the parsed "structured_data" section, or {} when absent."""
        return self.json.get("structured_data", {})

    @property
    def receipt_total(self) -> Optional[float]:
        """Return the receipt's total value, or None when not extracted."""
        total = self.structured_data.get("receipt", {}).get("total", {})
        return total.get("value") if total else None

class LlamaOCR:
    """OCR client: sends an image to the OpenAI vision API, receives markdown
    text, and converts it to structured JSON via Docling with a custom
    fallback parser.

    NOTE(review): despite the name, this class calls OpenAI's GPT-4 vision
    models, not a Llama model; the name is kept so existing callers work.
    """

    def __init__(self, api_key: str):
        """Store the OpenAI API key and chat-completions endpoint.

        Raises:
            ValueError: if api_key is empty or None.
        """
        if not api_key:
            raise ValueError("API key is required")
        self.api_key = api_key
        self.api_endpoint = "https://api.openai.com/v1/chat/completions"

    def _image_to_base64(self, image_path: str) -> str:
        """Read an image file and return its contents base64-encoded.

        Raises:
            FileNotFoundError: if the path does not exist.
        """
        image_path = Path(image_path)
        if not image_path.exists():
            raise FileNotFoundError(f"Image file not found: {image_path}")

        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode('utf-8')

    def _clean_markdown(self, content: str) -> str:
        """Strip code-fence markers (```markdown / ```) and outer whitespace."""
        content = content.replace('```markdown', '').replace('```', '')
        return content.strip()

    def _parse_amount(self, text: str) -> Optional[float]:
        """Extract a dollar amount from text; None when no price is present.

        Accepts thousands separators ("$1,234.56"); the original pattern
        stopped at the first comma and returned a truncated value.
        """
        match = re.search(r'\$\s*(\d[\d,]*\.?\d*)', text)
        if match:
            return float(match.group(1).replace(',', ''))
        return None

    def _parse_item_line(self, line: str) -> Optional[Tuple[str, str, float]]:
        """Parse a "description <gap> $price" table line.

        Returns:
            (item, amount_str, amount) or None when no price is found.
        """
        # Columns are separated by runs of 2+ spaces or tabs (table layout).
        parts = re.split(r'\s{2,}|\t+', line)
        if len(parts) >= 2:
            item = parts[0].strip()
            amount_str = parts[-1].strip()
            amount = self._parse_amount(amount_str)
            if amount is not None:
                return item, amount_str, amount
        return None

    def _validate_receipt_calculations(self, receipt_data: Dict[str, Any]) -> Dict[str, Any]:
        """Validate receipt arithmetic and return a validation report.

        Returns:
            Dict with "is_valid" (bool), "errors" (list), "warnings" (list).
        """
        validation_results: Dict[str, Any] = {
            "is_valid": True,
            "errors": [],
            "warnings": []
        }

        items = receipt_data.get("items", [])
        total_data = receipt_data.get("total")
        subtotal_data = receipt_data.get("subtotal")
        tax_data = receipt_data.get("tax")

        # Calculate sum of items
        items_total = sum(item.get("value", 0) for item in items)
        tax_value = tax_data.get("value") if tax_data else None

        # Validate total. The declared total on a receipt includes tax (this
        # module's own tax check below computes tax as total - subtotal), so
        # compare against items + tax; the original compared against the item
        # sum alone and flagged every taxed receipt as invalid.
        if total_data and "value" in total_data:
            declared_total = total_data["value"]
            expected_total = items_total + (tax_value or 0)
            if abs(declared_total - expected_total) > 0.01:  # allow rounding
                validation_results["is_valid"] = False
                validation_results["errors"].append({
                    "type": "total_mismatch",
                    "message": f"Total amount ({declared_total}) doesn't match expected total ({expected_total})",
                    "expected": expected_total,
                    "found": declared_total
                })

        # Validate subtotal if present
        if subtotal_data and "value" in subtotal_data:
            if abs(subtotal_data["value"] - items_total) > 0.01:
                validation_results["warnings"].append({
                    "type": "subtotal_mismatch",
                    "message": f"Subtotal ({subtotal_data['value']}) doesn't match sum of items ({items_total})"
                })

        # Cross-check tax when subtotal, total, and tax all carry values.
        # Guard every "value" key: the original indexed them directly and
        # could raise KeyError on a partially-parsed receipt.
        if (subtotal_data and "value" in subtotal_data
                and total_data and "value" in total_data
                and tax_data and "value" in tax_data):
            expected_tax = total_data["value"] - subtotal_data["value"]
            if abs(expected_tax - tax_data["value"]) > 0.01:
                validation_results["warnings"].append({
                    "type": "tax_mismatch",
                    "message": f"Tax amount ({tax_data['value']}) doesn't match difference between total and subtotal ({expected_tax})"
                })

        return validation_results

    def _calculate_receipt_metrics(self, receipt_data: Dict[str, Any]) -> Dict[str, Any]:
        """Compute summary metrics (count, average, extremes) for the items."""
        items = receipt_data.get("items", [])

        metrics: Dict[str, Any] = {
            "item_count": len(items),
            "average_item_price": 0,
            "highest_item": None,
            "lowest_item": None,
            "total_before_tax": sum(item.get("value", 0) for item in items)
        }

        if items:
            # Find highest and lowest priced items
            sorted_items = sorted(items, key=lambda x: x.get("value", 0))
            metrics["lowest_item"] = sorted_items[0]
            metrics["highest_item"] = sorted_items[-1]

            # item_count is necessarily > 0 inside this branch
            metrics["average_item_price"] = metrics["total_before_tax"] / metrics["item_count"]

        return metrics

    def _enhance_receipt_data(self, receipt_data: Dict[str, Any]) -> Dict[str, Any]:
        """Return a copy of receipt_data with validation, metrics, and
        processing-info sections attached."""
        validation = self._validate_receipt_calculations(receipt_data)
        metrics = self._calculate_receipt_metrics(receipt_data)

        enhanced_data = receipt_data.copy()
        enhanced_data.update({
            "validation": validation,
            "metrics": metrics,
            "processing_info": {
                "timestamp": datetime.now().isoformat(),
                "version": "1.0"
            }
        })

        return enhanced_data

    def _try_docling_conversion(self, markdown_content: str) -> tuple[Optional[Dict[str, Any]], bool]:
        """Attempt to convert markdown to JSON via Docling.

        Returns:
            (json_dict, True) on success, (None, False) on any failure —
            callers then fall back to the custom parser.
        """
        # Docling reads from a file, so stage the markdown in a temp file;
        # delete=False because Docling opens it after this context closes.
        with tempfile.NamedTemporaryFile(mode='w', suffix='.md', delete=False) as temp_file:
            temp_file.write(markdown_content)
            temp_path = temp_file.name

        try:
            print("\nAttempting Docling conversion...")
            doc = DoclingDocument.from_markdown(temp_path)
            docling_json = doc.to_dict()
            print("✓ Docling conversion successful")
            return docling_json, True
        except Exception as e:
            # Broad catch is deliberate: any Docling failure routes to the
            # fallback parser rather than aborting the pipeline.
            print(f"✗ Docling conversion failed: {e}")
            return None, False
        finally:
            Path(temp_path).unlink()

    def _fallback_conversion(self, markdown_content: str) -> Dict[str, Any]:
        """Convert receipt markdown to JSON with the custom line parser.

        Treats the first non-price line as the receipt title, classifies
        priced lines as total/subtotal/tax/item, then attaches validation
        and metrics via _enhance_receipt_data.
        """
        print("\nUsing enhanced fallback markdown parser...")
        content = self._clean_markdown(markdown_content)
        lines = [line.strip() for line in content.split('\n') if line.strip()]

        result: Dict[str, Any] = {
            "type": "document",
            "content": [],
            "metadata": {
                "source": "ocr",
                "format": "markdown",
                "processed_at": datetime.now().isoformat(),
                "processor": "custom_fallback"
            },
            "structured_data": {
                "receipt": {
                    "title": None,
                    "items": [],
                    "total": None,
                    "subtotal": None,
                    "tax": None,
                    "currency": "USD"  # assumes USD; the regex only matches "$"
                }
            }
        }

        current_section = "header"
        for line in lines:
            # Skip horizontal-rule / divider lines made only of dashes.
            if all(c in '-' for c in line if c.strip()):
                continue

            # First line without a price or table marker becomes the title.
            if current_section == "header" and not any(c in line for c in ['$', '|']):
                result["structured_data"]["receipt"]["title"] = line
                result["content"].append({
                    "type": "header",
                    "content": line,
                    "level": 1
                })
                current_section = "items"
                continue

            parsed = self._parse_item_line(line)
            if parsed:
                item, amount_str, amount = parsed

                # Classify by label keyword; "subtotal" is checked after
                # "total" here, so labels containing "total" win first —
                # matching the original branch order.
                if 'total' in item.lower() and 'subtotal' not in item.lower():
                    result["structured_data"]["receipt"]["total"] = {
                        "label": item.strip(),
                        "amount": amount_str,
                        "value": amount
                    }
                elif 'subtotal' in item.lower():
                    result["structured_data"]["receipt"]["subtotal"] = {
                        "label": item.strip(),
                        "amount": amount_str,
                        "value": amount
                    }
                elif 'tax' in item.lower():
                    result["structured_data"]["receipt"]["tax"] = {
                        "label": item.strip(),
                        "amount": amount_str,
                        "value": amount
                    }
                else:
                    item_data = {
                        "item": item.strip(),
                        "amount": amount_str,
                        "value": amount
                    }
                    result["structured_data"]["receipt"]["items"].append(item_data)
                    result["content"].append({
                        "type": "line_item",
                        "item": item.strip(),
                        "amount": amount_str,
                        "value": amount
                    })

        # Add validation and metrics
        if "receipt" in result["structured_data"]:
            result["structured_data"]["receipt"] = self._enhance_receipt_data(
                result["structured_data"]["receipt"]
            )

        return result

    def process_image(self, image_path: str) -> OCRResponse:
        """OCR an image with GPT-4 Vision and return structured results.

        Args:
            image_path: path to the image file (sent as JPEG data URL).

        Returns:
            OCRResponse with raw text, markdown, and JSON.

        Raises:
            FileNotFoundError: if the image does not exist.
            requests.exceptions.RequestException: on HTTP/API failure.
        """
        try:
            base64_image = self._image_to_base64(image_path)

            payload = {
                "model": "gpt-4o",  # current recommended vision-capable model
                "messages": [
                    {
                        "role": "system",
                        "content": "You are a helpful assistant that extracts text from images and formats it properly in markdown."
                    },
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "text",
                                "text": "Please extract all text from this image and format it in markdown, preserving any structure like tables, lists, and headers. Return only the extracted text without any explanations."
                            },
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:image/jpeg;base64,{base64_image}",
                                    "detail": "high"
                                }
                            }
                        ]
                    }
                ],
                "max_tokens": 1000,
                "temperature": 0  # deterministic extraction
            }

            headers = {
                "Authorization": f"Bearer {self.api_key}",
                "Content-Type": "application/json"
            }

            print("Sending request to OpenAI API...")
            print(f"Using model: {payload['model']}")

            # timeout prevents the original's indefinite hang when the API
            # stalls; 120s covers large high-detail images.
            response = requests.post(
                self.api_endpoint,
                json=payload,
                headers=headers,
                timeout=120
            )

            print(f"Response status: {response.status_code}")

            if response.status_code != 200:
                print(f"Error Response: {response.text}")
                response.raise_for_status()

            response_data = response.json()
            content = response_data["choices"][0]["message"]["content"]

            # Prefer Docling; fall back to the custom parser on failure.
            json_content, used_docling = self._try_docling_conversion(content)
            if not json_content:
                json_content = self._fallback_conversion(content)

            return OCRResponse(
                text=content,
                markdown=content,
                json=json_content,
                docling_processed=used_docling
            )

        except Exception as e:
            # Log then re-raise so callers can apply their own handling.
            print(f"Error: {str(e)}")
            raise
def save_outputs(result: OCRResponse, base_dir: str = "output") -> None:
    """Write OCR results to text, markdown, and JSON files and print a summary.

    Note: mutates ``result.json`` in place by stamping a ``metadata`` section
    with processing provenance before writing.

    Args:
        result: OCR pipeline output (any object with .text, .markdown, .json,
            .docling_processed, and .structured_data attributes works).
        base_dir: output directory; created (including parents) if missing.

    Raises:
        Re-raises any I/O or serialization error after logging it.
    """
    try:
        output_dir = Path(base_dir)
        # parents=True so nested paths like "runs/2024/out" work; the
        # original mkdir(exist_ok=True) raised when a parent was missing.
        output_dir.mkdir(parents=True, exist_ok=True)

        # Stamp provenance metadata into the JSON before writing it out.
        if isinstance(result.json, dict):
            result.json["metadata"] = {
                **result.json.get("metadata", {}),
                "processed_at": datetime.now().isoformat(),
                "processed_by": "docling" if result.docling_processed else "custom_parser"
            }

        # Save text and markdown outputs.
        (output_dir / "output.txt").write_text(result.text, encoding="utf-8")
        (output_dir / "output.md").write_text(result.markdown, encoding="utf-8")

        # Save JSON output
        with open(output_dir / "output.json", "w", encoding="utf-8") as f:
            json.dump(result.json, f, indent=2, ensure_ascii=False)

        # Print summary
        print(f"\nOutputs saved to {output_dir} directory")
        print(f"Processing method: {'Docling' if result.docling_processed else 'Custom parser'}")

        # Print receipt summary if available. NOTE(review): the item/metric
        # key names below match the custom fallback parser's schema; the
        # Docling path may use a different shape — confirm before relying on
        # this summary for Docling output.
        receipt_data = result.structured_data.get("receipt", {})
        if receipt_data:
            print("\nReceipt Summary:")
            print(f"Title: {receipt_data.get('title')}")

            validation = receipt_data.get("validation", {})
            if validation:
                print("\nValidation Results:")
                print(f"Valid: {'✓' if validation.get('is_valid', True) else '✗'}")

                if validation.get("errors"):
                    print("\nErrors:")
                    for error in validation["errors"]:
                        print(f"  ✗ {error['message']}")

                if validation.get("warnings"):
                    print("\nWarnings:")
                    for warning in validation["warnings"]:
                        print(f"  ⚠ {warning['message']}")

            metrics = receipt_data.get("metrics", {})
            if metrics:
                print("\nMetrics:")
                print(f"Item Count: {metrics['item_count']}")
                print(f"Average Price: ${metrics['average_item_price']:.2f}")
                if metrics.get('highest_item'):
                    print(f"Highest Item: {metrics['highest_item']['item']} (${metrics['highest_item']['value']:.2f})")
                if metrics.get('lowest_item'):
                    print(f"Lowest Item: {metrics['lowest_item']['item']} (${metrics['lowest_item']['value']:.2f})")

            print("\nItems:")
            for item in receipt_data.get('items', []):
                print(f"  - {item['item']}: {item['amount']}")

            if receipt_data.get('total'):
                print(f"\nTotal: {receipt_data['total']['amount']}")

    except Exception as e:
        # Log then re-raise so the caller's error handling still triggers.
        print(f"Error saving outputs: {e}")
        raise

def main():
    """Entry point: load the API key, OCR the sample receipt, save results.

    Exits early (with instructions) when OPENAI_API_KEY is not configured;
    known failure modes are caught and reported rather than raised.
    """
    # Resolve the .env file relative to this script instead of the original
    # hard-coded absolute path to a personal OneDrive folder, which only
    # worked on the author's machine.
    project_root = Path(__file__).resolve().parent
    env_path = project_root / '.env'

    print(f"Looking for .env at: {env_path}")
    print(f".env exists: {env_path.exists()}")

    # Load environment variables from the specific path
    load_dotenv(dotenv_path=env_path)

    # Get API key
    api_key = os.getenv('OPENAI_API_KEY')
    if not api_key:
        print("Error: OPENAI_API_KEY not found in environment variables")
        print("Please set your OpenAI API key using one of these methods:")
        print("1. Export it in your terminal: export OPENAI_API_KEY='your-key'")
        print("2. Add it to a .env file: OPENAI_API_KEY=your-key")
        return

    try:
        # Initialize OCR client (name is historical; it calls the OpenAI API).
        ocr = LlamaOCR(api_key=api_key)

        # Process the bundled sample image.
        result = ocr.process_image("sample_images/sample_receipt.jpg")

        # Print results
        print("\nExtracted Text:")
        print(result.text)
        print("\nMarkdown Format:")
        print(result.markdown)
        print("\nJSON Format:")
        print(json.dumps(result.json, indent=2))

        # Save outputs
        save_outputs(result)

    except FileNotFoundError as e:
        print(f"File not found error: {e}")
    except requests.exceptions.RequestException as e:
        print(f"API request error: {e}")
    except json.JSONDecodeError as e:
        print(f"JSON parsing error: {e}")
    except Exception as e:
        print(f"Unexpected error: {e}")

if __name__ == "__main__":
    main()

--

--

Michael Wahl
Michael Wahl

Written by Michael Wahl

Husband | Dad | VP of IT | MBA | Author | AI | #AWSCommunityBuilder | Opinions expressed here are my own | https://cv.michaelwahl.org

No responses yet