
Commit 301c3cb

Merge pull request #54 from arena-ai/46-update-log-probs-for-the-new-response_template
update log probs for the new response template
2 parents 3493c85 + 3055fd1 commit 301c3cb

File tree

7 files changed: +221 -178 lines changed


backend/app/api/routes/dde.py
+4 -105
@@ -32,6 +32,7 @@
 from openai.lib._pydantic import to_strict_json_schema
 from app.handlers.prompt_for_image import full_prompt_from_image
 from app.handlers.prompt_for_text import full_prompt_from_text
+from app.handlers.logprobs import map_characters_to_token_indices, extract_json_data

 from app.models import ContentType

@@ -484,109 +485,7 @@ async def extract_from_file(
         json_string = extracted_data[
             extracted_data.find("{") : extracted_data.rfind("}") + 1
         ]
-        #token_indices=map_characters_to_token_indices(extracted_data_token)
-        #regex_spans=find_value_spans(extracted_data)
-        #logprobs_sum=get_token_spans_and_logprobs(token_indices, regex_spans, extracted_data_token)
-        return {"extracted_data": json.loads(json_string), "extracted_logprobs": {}, "identifier": identifier}
-
-def map_characters_to_token_indices(extracted_data_token: list[TokenLogprob]) -> list[int]:
-    """
-    Maps each character in the JSON string output to its corresponding token index.
-
-    Args:
-        extracted_data_token : A list of `TokenLogprob` objects, where each object represents a token and its data (such as the logprobs).
-
-    Returns:
-        A list of integers where each position corresponds to a character in the concatenated JSON string,
-        and the integer at each position is the index of the token responsible for generating that character.
-
-    Example:
-    --------
-    Given `extracted_data_token = [TokenLogprob(token='{'), TokenLogprob(token='"key1"'), TokenLogprob(token=': '), TokenLogprob(token='"value1"'), TokenLogprob(token='}')]`,
-    the JSON output is '{"key1": "value1"}' and the function returns [0, 1, 1, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4].
-    """
-    json_output = "".join(token_data.token for token_data in extracted_data_token)
-
-    token_indices = [None] * len(json_output)
-    current_char_pos = 0
-
-    for token_idx, token_data in enumerate(extracted_data_token):
-        token_text = token_data.token
-        for char_pos in range(len(token_text)):
-            token_indices[current_char_pos] = token_idx
-            current_char_pos += 1
-
-    return token_indices
-
-def find_value_spans(json_string: str) -> list[tuple[str, tuple[int, int]]]:
-    """
-    Extracts the spans (start and end positions) of values (strings or numbers) within a JSON-formatted string.
-
-    Args:
-        json_string : A JSON-formatted string where values are paired with keys and separated by colons.
-
-    Returns:
-        A list of tuples, each containing a matched key and a (start, end) pair of integers giving the character span of its value within `json_string`.
-
-    Example:
-    --------
-    Given `json_string = '{"key1": "value1"}'`, the function will return:
-    [("key1", (9, 17))]
-    """
-    pattern = r'"([^"\n}]+)"\s*:\s*("[^"\n]+"|[-0-9.eE]+)\s*'
-
-    matches = []
-    for match in re.finditer(pattern, json_string):
-        value = match.group(1)
-        start = match.start(2)
-        end = match.end(2)
-        matches.append((value, (start, end)))
-    return matches
-
-
-def get_token_spans_and_logprobs(
-    token_indices: list[int],
-    value_spans: list[tuple[str, tuple[int, int]]],
-    extracted_data_token: list[TokenLogprob]
-) -> dict[str, float]:
-    """
-    Identifies the token indices covering each value span and sums the log probabilities of those tokens, giving an overall log probability per value span.
-
-    Args:
-        token_indices : A list mapping each character in the JSON string to a token index.
-        value_spans : A list of tuples, each containing a key and the character span of its value within the JSON string.
-        extracted_data_token : A list of `TokenLogprob` objects, each containing a token and its log probability data, where the index of each item corresponds to its token index.
-
-    Returns:
-        A dictionary mapping each key to the summed log probability of all the tokens that contain part of its value.
-
-    Example:
-    --------
-    Given:
-    - `token_indices = [0, 1, 1, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4]`, which maps each character to a token index.
-    - `value_spans = [("key1", (9, 17))]`.
-    - `extracted_data_token = [TokenLogprob(token="{", logprob=-1.5), TokenLogprob(token="key1", logprob=-1), TokenLogprob(token=": ", logprob=-1), TokenLogprob(token="value1", logprob=-1.5), TokenLogprob(token="}", logprob=-0.8)]`
-
-    The function will return:
-    {"key1": -1.5}
-    """
-    logprobs_for_values = {}
-
-    for value, (start, end) in value_spans:
-        token_start = token_indices[start]
-        token_end = token_indices[end]
-        logprobs = [
-            extracted_data_token[token_idx].logprob
-            for token_idx in range(token_start, token_end)
-        ]
-        logprobs_for_values[value] = sum(logprobs)
-
-    return logprobs_for_values
-
-
+        token_indices = map_characters_to_token_indices(extracted_data_token)
+        extracted_data = extract_json_data(json_string, extracted_data_token, token_indices)
+        return {"extracted_data": extracted_data, "identifier": identifier}
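The route now delegates log-prob handling to the new handler module: it maps every character of the model's JSON output to the token that produced it, then hands the string, tokens, and mapping to `extract_json_data`. Below is a minimal, self-contained sketch of that character-to-token mapping, using a hypothetical dataclass stand-in for `TokenLogprob` (the real model lives in `app.lm.models.chat_completion`); the extend-based loop is an equivalent rewrite of the committed version.

from dataclasses import dataclass

@dataclass
class TokenLogprob:  # hypothetical stand-in, not the app's Pydantic model
    token: str
    logprob: float = 0.0

def map_characters_to_token_indices(tokens: list[TokenLogprob]) -> list[int]:
    # One entry per character of the concatenated output; each entry is the
    # index of the token that produced that character.
    indices: list[int] = []
    for token_idx, token_data in enumerate(tokens):
        indices.extend([token_idx] * len(token_data.token))
    return indices

tokens = [TokenLogprob('{'), TokenLogprob('"key1"'), TokenLogprob(': '),
          TokenLogprob('"value1"'), TokenLogprob('}')]
assert map_characters_to_token_indices(tokens) == [
    0, 1, 1, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4
]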

backend/app/handlers/logprobs.py
+140
@@ -0,0 +1,140 @@
+from lark import Lark, Transformer, v_args, Tree, Token
+from lark.tree import Meta
+from pydantic import BaseModel
+from typing import Any, Optional
+import math
+from app.lm.models.chat_completion import TokenLogprob
+
+class HasProb(BaseModel):
+    value: Any
+    start: int
+    end: int
+    logprob: float
+    prob: float
+
+def map_characters_to_token_indices(extracted_data_token: list[TokenLogprob]) -> list[int]:
+    """
+    Maps each character in the JSON string output to its corresponding token index.
+
+    Args:
+        extracted_data_token : A list of `TokenLogprob` objects, where each object represents a token and its data (such as the logprobs).
+
+    Returns:
+        A list of integers where each position corresponds to a character in the concatenated JSON string,
+        and the integer at each position is the index of the token responsible for generating that character.
+
+    Example:
+    --------
+    Given `extracted_data_token = [TokenLogprob(token='{'), TokenLogprob(token='"key1"'), TokenLogprob(token=': '), TokenLogprob(token='"value1"'), TokenLogprob(token='}')]`,
+    the JSON output is '{"key1": "value1"}' and the function returns [0, 1, 1, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4].
+    """
+    json_output = "".join(token_data.token for token_data in extracted_data_token)
+
+    token_indices = [None] * len(json_output)
+    current_char_pos = 0
+
+    for token_idx, token_data in enumerate(extracted_data_token):
+        token_text = token_data.token
+        for char_pos in range(len(token_text)):
+            token_indices[current_char_pos] = token_idx
+            current_char_pos += 1
+
+    return token_indices
+
+# Define a grammar for JSON
+json_grammar = r"""
+    start: value
+
+    ?value: object          # '?' is a Lark convention: the rule is inlined instead of creating a separate parse tree node
+          | array
+          | string
+          | SIGNED_NUMBER -> number   # '-> number' specifies an alias for the rule
+          | "true"
+          | "false"
+          | "null"
+
+    array  : "[" [value ("," value)*] "]"
+    object : "{" [pair ("," pair)*] "}"
+    pair   : key ":" value
+    key    : ESCAPED_STRING
+
+    string : ESCAPED_STRING
+
+    %import common.ESCAPED_STRING
+    %import common.SIGNED_NUMBER
+    %import common.WS
+    %ignore WS
+"""
+
+@v_args(meta=True)
+class Extractor(Transformer):
+    def __init__(self, tokens: list[TokenLogprob], token_indices: list[int]):
+        super().__init__()
+        self.tokens = tokens
+        self.token_indices = token_indices
+
+    def _compute_logprob_sum(self, start: int, end: int) -> float:
+        token_start = self.token_indices[start]
+        token_end = self.token_indices[end]
+        sum_logprob = sum(self.tokens[i].logprob for i in range(token_start, token_end))
+        return sum_logprob
+
+    def number(self, meta: Meta, children: list[Token]) -> HasProb:
+        logprob_sum = self._compute_logprob_sum(meta.start_pos, meta.end_pos)
+        prob = math.exp(logprob_sum) * 100
+        return HasProb(value=float(children[0]), start=meta.start_pos, end=meta.end_pos, logprob=logprob_sum, prob=prob)
+
+    def string(self, meta: Meta, children: list[Token]) -> HasProb:
+        logprob_sum = self._compute_logprob_sum(meta.start_pos, meta.end_pos)
+        prob = math.exp(logprob_sum) * 100
+        return HasProb(value=children[0][1:-1], start=meta.start_pos, end=meta.end_pos, logprob=logprob_sum, prob=prob)
+
+    def true(self, meta: Meta, children: list[Token]) -> HasProb:
+        logprob_sum = self._compute_logprob_sum(meta.start_pos, meta.end_pos)
+        prob = math.exp(logprob_sum) * 100
+        return HasProb(value=True, start=meta.start_pos, end=meta.end_pos, logprob=logprob_sum, prob=prob)
+
+    def false(self, meta: Meta, children: list[Token]) -> HasProb:
+        logprob_sum = self._compute_logprob_sum(meta.start_pos, meta.end_pos)
+        prob = math.exp(logprob_sum) * 100
+        return HasProb(value=False, start=meta.start_pos, end=meta.end_pos, logprob=logprob_sum, prob=prob)
+
+    def null(self, meta: Meta, children: list[Token]) -> None:
+        return None
+
+    def array(self, meta: Meta, children: list[Any]) -> list[Any]:
+        return [child.value if isinstance(child, HasProb) else child for child in children]
+
+    def object(self, meta: Meta, children: list[tuple[str, Any]]) -> dict[str, Any]:
+        result = {}
+        for key, value in children:
+            if isinstance(value, HasProb):
+                result[key] = value.value
+                result[f"{key}_logprob"] = value.logprob
+                result[f"{key}_probability"] = value.prob
+            else:
+                result[key] = value
+        return result
+
+    def pair(self, meta: Meta, children: list[Any]) -> tuple[str, Any]:
+        key = children[0]
+        value = children[1]
+        if isinstance(value, Tree) and not value.children:  # e.g. ['b', Tree(Token('RULE', 'value'), [])] for bare true/false/null
+            value = None
+        return key, value
+
+    def key(self, meta: Meta, children: list[Token]) -> str:
+        return children[0][1:-1]
+
+    def start(self, meta: Meta, children: list[dict[str, Any]]) -> dict[str, Any]:
+        return children[0]
+
+json_parser = Lark(json_grammar, parser="lalr", propagate_positions=True, maybe_placeholders=False)
+
+def extract_json_data(json_string: str, tokens: list[TokenLogprob], token_indices: list[int]) -> dict[str, Any]:
+    tree = json_parser.parse(json_string)
+    extractor = Extractor(tokens, token_indices)
+    return extractor.transform(tree)
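Taken together, the module parses the model's JSON output with Lark (LALR, with source positions propagated) and annotates each key with the summed log probability of the tokens spanning its value, plus that sum exponentiated into a percentage. A usage sketch, assuming the `app` package is importable and `lark`/`pydantic` are installed; the token split and logprob values are illustrative:

from app.handlers.logprobs import map_characters_to_token_indices, extract_json_data
from app.lm.models.chat_completion import TokenLogprob

tokens = [
    TokenLogprob(token='{', logprob=-1.9365e-07),
    TokenLogprob(token='"key1"', logprob=-0.01117),
    TokenLogprob(token=': "', logprob=-0.00279),
    TokenLogprob(token='val', logprob=-1.1472e-06),
    TokenLogprob(token='ue1"', logprob=-0.00851),
    TokenLogprob(token='}', logprob=-1.265e-07),
]
json_string = "".join(t.token for t in tokens)       # '{"key1": "value1"}'
token_indices = map_characters_to_token_indices(tokens)

result = extract_json_data(json_string, tokens, token_indices)
# The '"value1"' span covers tokens 2-4 (': "', 'val', 'ue1"'), so roughly:
# {'key1': 'value1',
#  'key1_logprob': -0.0113,      # -0.00279 + -1.1472e-06 + -0.00851
#  'key1_probability': 98.88}    # math.exp(-0.0113) * 100

Note that the span sum starts at the token containing the value's opening quote, which is why the ': "' token contributes here.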

backend/app/tests/api/routes/test_dde.py
+1 -71
@@ -6,12 +6,6 @@
 import pytest
 import json
 from typing import Generator, Any
-from app.api.routes.dde import (
-    map_characters_to_token_indices,
-    find_value_spans,
-    get_token_spans_and_logprobs,
-)
-

 @pytest.fixture(scope="module")
 def document_data_extractor(
@@ -201,69 +195,5 @@ def test_update_document_data_example(
     assert response_data["data"] == json.dumps(updated_data)
     assert response_data["document_data_extractor_id"] == 1
     assert response_data["id"] == 1
-
-
-class TokenLogprob:
-    def __init__(self, token: str, logprob: float):
-        self.token = token
-        self.logprob = logprob
-
-@pytest.fixture
-def data_token():
-    return [
-        TokenLogprob(token='{', logprob=-1.9365e-07),    # Token index 0
-        TokenLogprob(token='"key1"', logprob=-0.01117),  # Token index 1
-        TokenLogprob(token=': "', logprob=-0.00279),     # Token index 2
-        TokenLogprob(token='val', logprob=-1.1472e-06),  # Token index 3
-        TokenLogprob(token='ue1"', logprob=-0.00851),    # Token index 4
-        TokenLogprob(token=', "', logprob=-0.00851),     # Token index 5
-        TokenLogprob(token='key2', logprob=-0.00851),    # Token index 6
-        TokenLogprob(token='": ', logprob=-0.00851),     # Token index 7
-        TokenLogprob(token='42', logprob=-0.00851),      # Token index 8
-        TokenLogprob(token='}', logprob=-1.265e-07)      # Token index 9
-    ]
-
-@pytest.fixture
-def token_indices():
-    return [0,
-            1, 1, 1, 1, 1, 1,
-            2, 2, 2,
-            3, 3, 3,
-            4, 4, 4, 4,
-            5, 5, 5,
-            6, 6, 6, 6,
-            7, 7, 7,
-            8, 8,
-            9]
-
-@pytest.fixture
-def sample_json_string():
-    return '{"key1": "value1", "key2": 42}'
-
-@pytest.fixture
-def value_spans():
-    return [
-        ("key1", (9, 17)),
-        ("key2", (27, 29))
-    ]
-
-def test_map_characters_to_token_indices(data_token, token_indices):
-    result = map_characters_to_token_indices(data_token)
-
-    assert result == token_indices
-    assert result.count(1) == len(data_token[1].token)
-
-def test_find_value_spans(sample_json_string, value_spans):
-    result = find_value_spans(sample_json_string)
-
-    assert result == value_spans
-    assert sample_json_string[9:17] == '"value1"'
-    assert sample_json_string[27:29] == '42'
-
-def test_get_token_spans_and_logprobs(token_indices, value_spans, data_token):
-    expected_output = {"key1": -0.0113011472, "key2": -0.00851}
-    result = get_token_spans_and_logprobs(token_indices, value_spans, data_token)
-
-    assert result == expected_output
+
 # TODO: test extract_from_file
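Until that TODO is resolved, a plausible replacement for the removed span-based tests would exercise the new handler module directly. This is a hypothetical sketch, not part of this commit; it assumes the `app` package is importable, as in the rest of this test suite:

import pytest
from app.handlers.logprobs import map_characters_to_token_indices, extract_json_data
from app.lm.models.chat_completion import TokenLogprob

def test_extract_json_data_annotates_keys():
    tokens = [
        TokenLogprob(token='{"a"', logprob=-0.1),
        TokenLogprob(token=': ', logprob=-0.2),
        TokenLogprob(token='42', logprob=-0.3),
        TokenLogprob(token='}', logprob=-0.4),
    ]
    json_string = "".join(t.token for t in tokens)   # '{"a": 42}'
    token_indices = map_characters_to_token_indices(tokens)

    result = extract_json_data(json_string, tokens, token_indices)

    assert result["a"] == 42.0                         # SIGNED_NUMBER comes back as float
    assert result["a_logprob"] == pytest.approx(-0.3)  # only the '42' token spans the value
    assert 0 < result["a_probability"] < 100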
