Skip to content

expensevalidator #1739

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
May 12, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 35 additions & 0 deletions ai/gen-ai-agents/expense validator/LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
Copyright (c) 2025 Oracle and/or its affiliates.

The Universal Permissive License (UPL), Version 1.0

Subject to the condition set forth below, permission is hereby granted to any
person obtaining a copy of this software, associated documentation and/or data
(collectively the "Software"), free of charge and under any and all copyright
rights in the Software, and any and all patent rights owned or freely
licensable by each licensor hereunder covering either (i) the unmodified
Software as contributed to or provided by such licensor, or (ii) the Larger
Works (as defined below), to deal in both

(a) the Software, and
(b) any piece of software and/or hardware listed in the lrgrwrks.txt file if
one is included with the Software (each a "Larger Work" to which the Software
is contributed by such licensors),

without restriction, including without limitation the rights to copy, create
derivative works of, display, perform, and distribute the Software and make,
use, sell, offer for sale, import, export, have made, and have sold the
Software and the Larger Work(s), and to sublicense the foregoing rights on
either these or other terms.

This license is subject to the following condition:
The above copyright notice and either this complete permission notice or at
a minimum a reference to the UPL must be included in all copies or
substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
220 changes: 220 additions & 0 deletions ai/gen-ai-agents/expense validator/files/backend.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,220 @@
import json
import os
import tempfile
from typing import Dict, Tuple, Any

import fitz  # PyMuPDF
from langgraph.graph import StateGraph, START, END
from langchain_core.messages import HumanMessage
from pydantic import BaseModel
from typing_extensions import TypedDict

from oci_models import get_llm  # LLM loader
from utils import remove_triple_backtics  # Output cleaner

# Dummy API that simulates checking invoice value
def dummy_invoice_api_check(extracted_total: float) -> float:
    """Simulate a backend invoice-system lookup.

    Stand-in for a real API call: simply echoes back the total extracted
    from the PDF, so the downstream amount check compares it against the
    declared amount unchanged.
    """
    return extracted_total

# --- Data Models ---
class ExtractedPDFData(BaseModel):
    """Structured expense data extracted from a PDF by the LLM."""

    # JSON-like payload produced by the extraction prompt (employee_name,
    # claim_date, items, total_amount — schema enforced only by the prompt).
    data: Dict[str, Any]

    def make_hashable(self):
        """Replace list values in ``data`` with tuples, in place.

        Tuples are hashable, which lists are not; keys are left untouched.
        """
        for field_name in list(self.data):
            current = self.data[field_name]
            if isinstance(current, list):
                self.data[field_name] = tuple(current)

class State(TypedDict):
    """Shared state passed between the LangGraph workflow nodes."""
    pdf_path: str  # filesystem path of the (temp) PDF being processed
    declared_amount: float  # total amount the employee declared
    extracted_information: ExtractedPDFData  # structured data from the Extract node
    validation_messages: list  # human-readable results appended by each check node
    error: str  # error description set by a node on failure; None otherwise

# --- Agent ---
class ExpenseValidationAgent:
    """Validates an expense-claim PDF through a LangGraph pipeline.

    Nodes run sequentially: Extract -> PolicyCheck -> CategoryCheck ->
    AmountCheck. Each check appends one human-readable status message to
    ``validation_messages`` in the shared ``State``.
    """

    # ------------------------------------------------------------------
    # PDF handling
    # ------------------------------------------------------------------
    def extract_pdf_text(self, pdf_path: str) -> str:
        """Return the plain text of every page in the PDF, newline-joined."""
        with fitz.open(pdf_path) as doc:
            # join once instead of quadratic += concatenation
            pages = [page.get_text("text") for page in doc]
        return "\n".join(pages).strip()

    def process_pdf(self, pdf_path: str) -> ExtractedPDFData:
        """Extract structured expense data from a PDF via the LLM.

        Raises:
            Exception: if the PDF yields no text, the LLM reply is empty,
                does not look like JSON, or fails to parse.
        """
        llm = get_llm()
        text = self.extract_pdf_text(pdf_path)

        # early check if PDF is unreadable (extract_pdf_text already strips)
        if not text:
            raise Exception("❌ No readable text extracted from the uploaded PDF. It may be scanned badly or empty.")

        prompt = f"""
Extract ONLY a valid JSON object from the following document.
No explanations, no formatting, no triple backticks.

Required fields:
- employee_name (string)
- claim_date (string)
- items (list of dicts with keys: 'description' (string), 'amount' (float), 'category' (string))
- total_amount (float)

Output must be a single valid JSON object.

Document:
{text}
"""

        response = llm.invoke([{"role": "user", "content": prompt}])

        if not response or not response.content or not response.content.strip():
            raise Exception("❌ LLM returned an empty output. Cannot extract PDF information.")

        cleaned = remove_triple_backtics(response.content.strip())

        # early check if LLM output is blank after fence removal
        if not cleaned or cleaned.strip() == "":
            raise Exception("❌ Cleaned LLM output is empty. No valid data to extract.")

        if not cleaned.startswith("{"):
            raise Exception(f"❌ LLM output does not start with a JSON object.\nRaw output:\n{cleaned}")

        try:
            data = json.loads(cleaned)
        except Exception as e:
            raise Exception(f"❌ Failed to parse LLM output as JSON.\nRaw output:\n{cleaned}\nError: {e}")

        structured = ExtractedPDFData(data=data)
        structured.make_hashable()
        return structured

    # ------------------------------------------------------------------
    # Shared helpers for the LLM-based check nodes
    # ------------------------------------------------------------------
    @staticmethod
    def _parse_llm_json(raw: str, check_name: str) -> Dict[str, Any]:
        """Strip markdown code fences from ``raw`` and parse it as JSON.

        ``check_name`` (e.g. "policy check") is interpolated into the error
        message so failures identify which node produced the bad output.
        """
        cleaned = raw.replace("```json", "").replace("```", "").strip()
        try:
            return json.loads(cleaned)
        except Exception as e:
            raise Exception(f"❌ LLM {check_name} did not return valid JSON.\nRaw output:\n{cleaned}\nError: {e}")

    @staticmethod
    def _status_message(result: Dict[str, Any], check_label: str) -> str:
        """Format a pass/fail LLM result dict as a display message."""
        status = result.get("status", "").lower()
        reason = result.get("reason", "No reason provided.")
        prefix = "✅ " if status == "pass" else "❌ "
        return prefix + check_label + ": " + reason

    # ------------------------------------------------------------------
    # Workflow nodes
    # ------------------------------------------------------------------
    def llm_extract_node(self, state: State) -> Dict[str, Any]:
        """Extract node: parse the PDF into structured data, or set an error."""
        pdf_path = state["pdf_path"]
        extracted_data = self.process_pdf(pdf_path)

        if not extracted_data or not extracted_data.data:
            return {"extracted_information": None, "error": "Failed to extract structured PDF content."}

        return {"extracted_information": extracted_data, "error": None}

    def check_policy_node(self, state: State) -> Dict[str, Any]:
        """Ask the LLM whether the claim conforms to company policy."""
        llm = get_llm(temperature=0.0)
        extracted = state["extracted_information"].data

        # NOTE(review): policy text is a placeholder — fill in the real rules.
        policy_text = """..."""
        prompt = f"""
Given the company policy:
{policy_text}

And the following expense claim:
{json.dumps(extracted, indent=2)}

Return a JSON object with:
- status: "pass" if the claim conforms, "fail" if it violates
- reason: 1-2 sentences explaining why

Respond ONLY with a valid JSON object. Do not add anything else.
"""

        response = llm.invoke([HumanMessage(content=prompt)])
        result = self._parse_llm_json(response.content.strip(), "policy check")
        message = self._status_message(result, "Policy Check")
        return {"validation_messages": state.get("validation_messages", []) + [message]}

    def check_category_node(self, state: State) -> Dict[str, Any]:
        """Ask the LLM whether any item is filed under a mismatched category."""
        llm = get_llm(temperature=0.0)
        extracted = state["extracted_information"].data

        prompt = f"""
Given this expense data:
{json.dumps(extracted, indent=2)}

Are any of the expense items clearly mismatched? For example, if 'Bread' is categorized under 'Travel'.

Return a JSON object with:
- status: "pass" if all items are categorized correctly, "fail" if there are mismatches
- reason: 1-2 sentences explaining if any mismatch exists.

Respond ONLY with a valid JSON object.
"""

        response = llm.invoke([HumanMessage(content=prompt)])
        result = self._parse_llm_json(response.content.strip(), "category check")
        message = self._status_message(result, "Category Check")
        return {"validation_messages": state.get("validation_messages", []) + [message]}

    def check_declared_amount_node(self, state: State) -> Dict[str, Any]:
        """Compare the extracted invoice total with the declared amount.

        Uses a 10-cent tolerance to absorb rounding differences.
        """
        extracted_total = state["extracted_information"].data.get("total_amount", 0.0)
        api_total = dummy_invoice_api_check(extracted_total)
        declared = state["declared_amount"]

        if abs(api_total - declared) > 0.1:
            message = f"⚠️ Declared amount mismatch. Declared: ${declared:.2f}, Backend Invoice: ${api_total:.2f}"
        else:
            message = "✅ Declared Amount Check: No significant mismatch"
        return {"validation_messages": state.get("validation_messages", []) + [message]}

    # ------------------------------------------------------------------
    # Graph assembly
    # ------------------------------------------------------------------
    def create_workflow(self):
        """Compile and return the sequential LangGraph validation workflow."""
        graph = StateGraph(State)

        graph.add_node("Extract", self.llm_extract_node)
        graph.add_node("PolicyCheck", self.check_policy_node)
        graph.add_node("CategoryCheck", self.check_category_node)
        graph.add_node("AmountCheck", self.check_declared_amount_node)

        graph.add_edge(START, "Extract")
        graph.add_edge("Extract", "PolicyCheck")
        graph.add_edge("PolicyCheck", "CategoryCheck")
        graph.add_edge("CategoryCheck", "AmountCheck")
        graph.add_edge("AmountCheck", END)

        return graph.compile()

# --- Public API ---
def process_expense_workflow(pdf_bytes: bytes, declared_amount: float) -> Tuple[Dict[str, Any], list]:
    """Run the full extraction + validation workflow on an uploaded PDF.

    Args:
        pdf_bytes: Raw bytes of the uploaded expense-claim PDF.
        declared_amount: Total amount the employee declared.

    Returns:
        Tuple of (extracted expense data dict, list of validation messages).

    Raises:
        Exception: if extraction fails or a workflow node reports an error.
    """
    # Persist the upload to a temp file because PyMuPDF opens by path.
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
    try:
        temp_file.write(pdf_bytes)
        temp_file.close()

        agent = ExpenseValidationAgent()
        workflow = agent.create_workflow()

        initial_state = {
            "pdf_path": temp_file.name,
            "declared_amount": declared_amount,
            "extracted_information": None,
            "validation_messages": [],
            "error": None,
        }

        final_state = workflow.invoke(initial_state)
    finally:
        # Fix: the temp file was previously never removed (disk leak on
        # every request). Best-effort cleanup; ignore races on deletion.
        temp_file.close()
        try:
            os.unlink(temp_file.name)
        except OSError:
            pass

    if final_state.get("error"):
        raise Exception(final_state["error"])

    return final_state["extracted_information"].data, final_state["validation_messages"]
8 changes: 8 additions & 0 deletions ai/gen-ai-agents/expense validator/files/config
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# OCI SDK configuration profile. Values below are truncated/sanitized
# placeholders — replace with real OCIDs before use.
[DEFAULT]
# OCID of the IAM user the SDK authenticates as.
user=ocid1.use
# Fingerprint of the uploaded API signing key.
fingerprint=c6:4f:
# OCID of the tenancy.
tenancy=ocid1.te
# Region used to build service endpoints.
region=eu-frankfurt-1
# Path to the private API signing key file.
key_file=~/.


24 changes: 24 additions & 0 deletions ai/gen-ai-agents/expense validator/files/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
"""
OCI models configuration and general config
"""

DEBUG = False

MODEL_ID = "meta.llama-3.3-70b-instruct"

AUTH = "API_KEY"
SERVICE_ENDPOINT = "https://inference.generativeai.eu-frankfurt-1.oci.oraclecloud.com"

TEMPERATURE = 0.1
MAX_TOKENS = 1024
TOP_P = 0.9

# OCI general
COMPARTMENT_ID = "ocid1.compart.."

# history management
MAX_MSGS_IN_HISTORY = 10
# low, cause we're generating code
MAX_ROWS_IN_SAMPLE = 10


17 changes: 17 additions & 0 deletions ai/gen-ai-agents/expense validator/files/expense_validation_flow
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
// Expense-validation flowchart (generated by graph.py).
// Fix: multi-line labels must use the \n escape inside one quoted string,
// matching the generator's dot.node(..., 'Policy Check\n(...)') calls —
// the labels had been split across raw physical lines.
digraph {
	A [label="Upload Expense Claim PDF"]
	B [label="Extract Data from PDF (LLM)"]
	C [label="Policy Check\n(conformance to rules)"]
	D [label="Category Check\n(mislabeling detection)"]
	E [label="Declared Amount Check\n(vs backend/API)"]
	F [label="Display Results\n(Green/Red Status)"]
	A -> B
	B -> C
	C -> D
	D -> E
	E -> F
}
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
34 changes: 34 additions & 0 deletions ai/gen-ai-agents/expense validator/files/frontend.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import streamlit as st
from backend import process_expense_workflow

st.set_page_config(page_title="Employee Expense Validator", page_icon="📤")

st.title("Employee Expense Claim Validator")

# Inputs: the claim PDF and the total amount the employee declared.
uploaded_file = st.file_uploader("Upload Expense Claim PDF", type=["pdf"])
declared_amount = st.number_input("Enter Declared Total Amount ($)", min_value=0.0, step=10.0)


def _render_validation(message: str) -> None:
    """Display one validation message with a severity-matched widget."""
    if message.startswith(("⚠️", "❌")):
        st.error(message)
    elif message.startswith("✅"):
        st.success(message)
    else:
        st.info(message)


if uploaded_file and declared_amount > 0:
    st.success("✅ PDF uploaded and amount entered.")

    with st.spinner("Running extraction and validation workflow..."):
        try:
            extracted_data, validations = process_expense_workflow(
                uploaded_file.read(), declared_amount
            )

            st.subheader("🔎 Extracted Expense Data")
            st.json(extracted_data, expanded=False)

            st.subheader("Validation Results")
            for message in validations:
                _render_validation(message)

        except Exception as e:
            st.error(f"❌ Error: {str(e)}. Please check the uploaded file or inputs.")

25 changes: 25 additions & 0 deletions ai/gen-ai-agents/expense validator/files/graph.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
from graphviz import Digraph

# Workflow stages in pipeline order: (node id, display label).
_STAGES = [
    ("A", "Upload Expense Claim PDF"),
    ("B", "Extract Data from PDF (LLM)"),
    ("C", "Policy Check\n(conformance to rules)"),
    ("D", "Category Check\n(mislabeling detection)"),
    ("E", "Declared Amount Check\n(vs backend/API)"),
    ("F", "Display Results\n(Green/Red Status)"),
]

dot = Digraph()

# One node per stage.
for node_id, label in _STAGES:
    dot.node(node_id, label)

# Chain consecutive stages: A -> B -> ... -> F.
for (src, _), (dst, _) in zip(_STAGES, _STAGES[1:]):
    dot.edge(src, dst)

# Render to file
dot.format = 'png'
dot.render('expense_validation_flow', view=False)

print("✅ Flowchart generated: 'expense_validation_flow.png'")
Loading