Skip to content

expensevalidator #1739

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
May 12, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 35 additions & 0 deletions ai/gen-ai-agents/expense validator/LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
Copyright (c) 2025 Oracle and/or its affiliates.

The Universal Permissive License (UPL), Version 1.0

Subject to the condition set forth below, permission is hereby granted to any
person obtaining a copy of this software, associated documentation and/or data
(collectively the "Software"), free of charge and under any and all copyright
rights in the Software, and any and all patent rights owned or freely
licensable by each licensor hereunder covering either (i) the unmodified
Software as contributed to or provided by such licensor, or (ii) the Larger
Works (as defined below), to deal in both

(a) the Software, and
(b) any piece of software and/or hardware listed in the lrgrwrks.txt file if
one is included with the Software (each a "Larger Work" to which the Software
is contributed by such licensors),

without restriction, including without limitation the rights to copy, create
derivative works of, display, perform, and distribute the Software and make,
use, sell, offer for sale, import, export, have made, and have sold the
Software and the Larger Work(s), and to sublicense the foregoing rights on
either these or other terms.

This license is subject to the following condition:
The above copyright notice and either this complete permission notice or at
a minimum a reference to the UPL must be included in all copies or
substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
220 changes: 220 additions & 0 deletions ai/gen-ai-agents/expense validator/files/backend.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,220 @@
import json
import os
import tempfile
from typing import Dict, Tuple, Any

import fitz  # PyMuPDF
from langgraph.graph import StateGraph, START, END
from langchain_core.messages import HumanMessage
from pydantic import BaseModel
from typing_extensions import TypedDict

from oci_models import get_llm  # LLM loader
from utils import remove_triple_backtics  # Output cleaner

# Dummy API that simulates checking invoice value
def dummy_invoice_api_check(extracted_total: float) -> float:
    """Simulate a backend invoice-system lookup.

    Stand-in for a real API call: simply echoes back the total extracted
    from the PDF, so the downstream amount check compares it against the
    declared amount unchanged.
    """
    return extracted_total

# --- Data Models ---
class ExtractedPDFData(BaseModel):
    """Structured expense data extracted from a PDF by the LLM."""

    # JSON-like payload produced by the extraction prompt (employee_name,
    # claim_date, items, total_amount — schema enforced only by the prompt).
    data: Dict[str, Any]

    def make_hashable(self):
        """Replace list values in ``data`` with tuples, in place.

        Tuples are hashable, which lists are not; keys are left untouched.
        """
        for field_name in list(self.data):
            current = self.data[field_name]
            if isinstance(current, list):
                self.data[field_name] = tuple(current)

class State(TypedDict):
    """Shared state passed between the LangGraph workflow nodes."""
    pdf_path: str  # filesystem path of the (temp) PDF being processed
    declared_amount: float  # total amount the employee declared
    extracted_information: ExtractedPDFData  # structured data from the Extract node
    validation_messages: list  # human-readable results appended by each check node
    error: str  # error description set by a node on failure; None otherwise

# --- Agent ---
class ExpenseValidationAgent:
    """Validates an expense-claim PDF through a LangGraph pipeline.

    Nodes run sequentially: Extract -> PolicyCheck -> CategoryCheck ->
    AmountCheck. Each check appends one human-readable status message to
    ``validation_messages`` in the shared ``State``.
    """

    # ------------------------------------------------------------------
    # PDF handling
    # ------------------------------------------------------------------
    def extract_pdf_text(self, pdf_path: str) -> str:
        """Return the plain text of every page in the PDF, newline-joined."""
        with fitz.open(pdf_path) as doc:
            # join once instead of quadratic += concatenation
            pages = [page.get_text("text") for page in doc]
        return "\n".join(pages).strip()

    def process_pdf(self, pdf_path: str) -> ExtractedPDFData:
        """Extract structured expense data from a PDF via the LLM.

        Raises:
            Exception: if the PDF yields no text, the LLM reply is empty,
                does not look like JSON, or fails to parse.
        """
        llm = get_llm()
        text = self.extract_pdf_text(pdf_path)

        # early check if PDF is unreadable (extract_pdf_text already strips)
        if not text:
            raise Exception("❌ No readable text extracted from the uploaded PDF. It may be scanned badly or empty.")

        prompt = f"""
Extract ONLY a valid JSON object from the following document.
No explanations, no formatting, no triple backticks.

Required fields:
- employee_name (string)
- claim_date (string)
- items (list of dicts with keys: 'description' (string), 'amount' (float), 'category' (string))
- total_amount (float)

Output must be a single valid JSON object.

Document:
{text}
"""

        response = llm.invoke([{"role": "user", "content": prompt}])

        if not response or not response.content or not response.content.strip():
            raise Exception("❌ LLM returned an empty output. Cannot extract PDF information.")

        cleaned = remove_triple_backtics(response.content.strip())

        # early check if LLM output is blank after fence removal
        if not cleaned or cleaned.strip() == "":
            raise Exception("❌ Cleaned LLM output is empty. No valid data to extract.")

        if not cleaned.startswith("{"):
            raise Exception(f"❌ LLM output does not start with a JSON object.\nRaw output:\n{cleaned}")

        try:
            data = json.loads(cleaned)
        except Exception as e:
            raise Exception(f"❌ Failed to parse LLM output as JSON.\nRaw output:\n{cleaned}\nError: {e}")

        structured = ExtractedPDFData(data=data)
        structured.make_hashable()
        return structured

    # ------------------------------------------------------------------
    # Shared helpers for the LLM-based check nodes
    # ------------------------------------------------------------------
    @staticmethod
    def _parse_llm_json(raw: str, check_name: str) -> Dict[str, Any]:
        """Strip markdown code fences from ``raw`` and parse it as JSON.

        ``check_name`` (e.g. "policy check") is interpolated into the error
        message so failures identify which node produced the bad output.
        """
        cleaned = raw.replace("```json", "").replace("```", "").strip()
        try:
            return json.loads(cleaned)
        except Exception as e:
            raise Exception(f"❌ LLM {check_name} did not return valid JSON.\nRaw output:\n{cleaned}\nError: {e}")

    @staticmethod
    def _status_message(result: Dict[str, Any], check_label: str) -> str:
        """Format a pass/fail LLM result dict as a display message."""
        status = result.get("status", "").lower()
        reason = result.get("reason", "No reason provided.")
        prefix = "✅ " if status == "pass" else "❌ "
        return prefix + check_label + ": " + reason

    # ------------------------------------------------------------------
    # Workflow nodes
    # ------------------------------------------------------------------
    def llm_extract_node(self, state: State) -> Dict[str, Any]:
        """Extract node: parse the PDF into structured data, or set an error."""
        pdf_path = state["pdf_path"]
        extracted_data = self.process_pdf(pdf_path)

        if not extracted_data or not extracted_data.data:
            return {"extracted_information": None, "error": "Failed to extract structured PDF content."}

        return {"extracted_information": extracted_data, "error": None}

    def check_policy_node(self, state: State) -> Dict[str, Any]:
        """Ask the LLM whether the claim conforms to company policy."""
        llm = get_llm(temperature=0.0)
        extracted = state["extracted_information"].data

        # NOTE(review): policy text is a placeholder — fill in the real rules.
        policy_text = """..."""
        prompt = f"""
Given the company policy:
{policy_text}

And the following expense claim:
{json.dumps(extracted, indent=2)}

Return a JSON object with:
- status: "pass" if the claim conforms, "fail" if it violates
- reason: 1-2 sentences explaining why

Respond ONLY with a valid JSON object. Do not add anything else.
"""

        response = llm.invoke([HumanMessage(content=prompt)])
        result = self._parse_llm_json(response.content.strip(), "policy check")
        message = self._status_message(result, "Policy Check")
        return {"validation_messages": state.get("validation_messages", []) + [message]}

    def check_category_node(self, state: State) -> Dict[str, Any]:
        """Ask the LLM whether any item is filed under a mismatched category."""
        llm = get_llm(temperature=0.0)
        extracted = state["extracted_information"].data

        prompt = f"""
Given this expense data:
{json.dumps(extracted, indent=2)}

Are any of the expense items clearly mismatched? For example, if 'Bread' is categorized under 'Travel'.

Return a JSON object with:
- status: "pass" if all items are categorized correctly, "fail" if there are mismatches
- reason: 1-2 sentences explaining if any mismatch exists.

Respond ONLY with a valid JSON object.
"""

        response = llm.invoke([HumanMessage(content=prompt)])
        result = self._parse_llm_json(response.content.strip(), "category check")
        message = self._status_message(result, "Category Check")
        return {"validation_messages": state.get("validation_messages", []) + [message]}

    def check_declared_amount_node(self, state: State) -> Dict[str, Any]:
        """Compare the extracted invoice total with the declared amount.

        Uses a 10-cent tolerance to absorb rounding differences.
        """
        extracted_total = state["extracted_information"].data.get("total_amount", 0.0)
        api_total = dummy_invoice_api_check(extracted_total)
        declared = state["declared_amount"]

        if abs(api_total - declared) > 0.1:
            message = f"⚠️ Declared amount mismatch. Declared: ${declared:.2f}, Backend Invoice: ${api_total:.2f}"
        else:
            message = "✅ Declared Amount Check: No significant mismatch"
        return {"validation_messages": state.get("validation_messages", []) + [message]}

    # ------------------------------------------------------------------
    # Graph assembly
    # ------------------------------------------------------------------
    def create_workflow(self):
        """Compile and return the sequential LangGraph validation workflow."""
        graph = StateGraph(State)

        graph.add_node("Extract", self.llm_extract_node)
        graph.add_node("PolicyCheck", self.check_policy_node)
        graph.add_node("CategoryCheck", self.check_category_node)
        graph.add_node("AmountCheck", self.check_declared_amount_node)

        graph.add_edge(START, "Extract")
        graph.add_edge("Extract", "PolicyCheck")
        graph.add_edge("PolicyCheck", "CategoryCheck")
        graph.add_edge("CategoryCheck", "AmountCheck")
        graph.add_edge("AmountCheck", END)

        return graph.compile()

# --- Public API ---
def process_expense_workflow(pdf_bytes: bytes, declared_amount: float) -> Tuple[Dict[str, Any], list]:
    """Run the full extraction + validation workflow on an uploaded PDF.

    Args:
        pdf_bytes: Raw bytes of the uploaded expense-claim PDF.
        declared_amount: Total amount the employee declared.

    Returns:
        Tuple of (extracted expense data dict, list of validation messages).

    Raises:
        Exception: if extraction fails or a workflow node reports an error.
    """
    # Persist the upload to a temp file because PyMuPDF opens by path.
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
    try:
        temp_file.write(pdf_bytes)
        temp_file.close()

        agent = ExpenseValidationAgent()
        workflow = agent.create_workflow()

        initial_state = {
            "pdf_path": temp_file.name,
            "declared_amount": declared_amount,
            "extracted_information": None,
            "validation_messages": [],
            "error": None,
        }

        final_state = workflow.invoke(initial_state)
    finally:
        # Fix: the temp file was previously never removed (disk leak on
        # every request). Best-effort cleanup; ignore races on deletion.
        temp_file.close()
        try:
            os.unlink(temp_file.name)
        except OSError:
            pass

    if final_state.get("error"):
        raise Exception(final_state["error"])

    return final_state["extracted_information"].data, final_state["validation_messages"]
8 changes: 8 additions & 0 deletions ai/gen-ai-agents/expense validator/files/config
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# OCI SDK configuration profile. Values below are truncated/sanitized
# placeholders — replace with real OCIDs before use.
[DEFAULT]
# OCID of the IAM user the SDK authenticates as.
user=ocid1.use
# Fingerprint of the uploaded API signing key.
fingerprint=c6:4f:
# OCID of the tenancy.
tenancy=ocid1.te
# Region used to build service endpoints.
region=eu-frankfurt-1
# Path to the private API signing key file.
key_file=~/.


24 changes: 24 additions & 0 deletions ai/gen-ai-agents/expense validator/files/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
"""
OCI models configuration and general config
"""

DEBUG = False

MODEL_ID = "meta.llama-3.3-70b-instruct"

AUTH = "API_KEY"
SERVICE_ENDPOINT = "https://inference.generativeai.eu-frankfurt-1.oci.oraclecloud.com"

TEMPERATURE = 0.1
MAX_TOKENS = 1024
TOP_P = 0.9

# OCI general
COMPARTMENT_ID = "ocid1.compart.."

# history management
MAX_MSGS_IN_HISTORY = 10
# low, cause we're generating code
MAX_ROWS_IN_SAMPLE = 10


17 changes: 17 additions & 0 deletions ai/gen-ai-agents/expense validator/files/expense_validation_flow
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
// Expense-validation flowchart (generated by graph.py).
// Fix: multi-line labels must use the \n escape inside one quoted string,
// matching the generator's dot.node(..., 'Policy Check\n(...)') calls —
// the labels had been split across raw physical lines.
digraph {
	A [label="Upload Expense Claim PDF"]
	B [label="Extract Data from PDF (LLM)"]
	C [label="Policy Check\n(conformance to rules)"]
	D [label="Category Check\n(mislabeling detection)"]
	E [label="Declared Amount Check\n(vs backend/API)"]
	F [label="Display Results\n(Green/Red Status)"]
	A -> B
	B -> C
	C -> D
	D -> E
	E -> F
}
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
34 changes: 34 additions & 0 deletions ai/gen-ai-agents/expense validator/files/frontend.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import streamlit as st
from backend import process_expense_workflow

st.set_page_config(page_title="Employee Expense Validator", page_icon="📤")

st.title("Employee Expense Claim Validator")

# Inputs: the claim PDF and the total amount the employee declared.
uploaded_file = st.file_uploader("Upload Expense Claim PDF", type=["pdf"])
declared_amount = st.number_input("Enter Declared Total Amount ($)", min_value=0.0, step=10.0)


def _render_validation(message: str) -> None:
    """Display one validation message with a severity-matched widget."""
    if message.startswith(("⚠️", "❌")):
        st.error(message)
    elif message.startswith("✅"):
        st.success(message)
    else:
        st.info(message)


if uploaded_file and declared_amount > 0:
    st.success("✅ PDF uploaded and amount entered.")

    with st.spinner("Running extraction and validation workflow..."):
        try:
            extracted_data, validations = process_expense_workflow(
                uploaded_file.read(), declared_amount
            )

            st.subheader("🔎 Extracted Expense Data")
            st.json(extracted_data, expanded=False)

            st.subheader("Validation Results")
            for message in validations:
                _render_validation(message)

        except Exception as e:
            st.error(f"❌ Error: {str(e)}. Please check the uploaded file or inputs.")

25 changes: 25 additions & 0 deletions ai/gen-ai-agents/expense validator/files/graph.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
from graphviz import Digraph

# Workflow stages in pipeline order: (node id, display label).
_STAGES = [
    ("A", "Upload Expense Claim PDF"),
    ("B", "Extract Data from PDF (LLM)"),
    ("C", "Policy Check\n(conformance to rules)"),
    ("D", "Category Check\n(mislabeling detection)"),
    ("E", "Declared Amount Check\n(vs backend/API)"),
    ("F", "Display Results\n(Green/Red Status)"),
]

dot = Digraph()

# One node per stage.
for node_id, label in _STAGES:
    dot.node(node_id, label)

# Chain consecutive stages: A -> B -> ... -> F.
for (src, _), (dst, _) in zip(_STAGES, _STAGES[1:]):
    dot.edge(src, dst)

# Render to file
dot.format = 'png'
dot.render('expense_validation_flow', view=False)

print("✅ Flowchart generated: 'expense_validation_flow.png'")
Loading