[Feat] update gsm8k and math agent config (#652)

yingfhu · web-flow · commit 9eb5cadcacaf · 2023-12-01T15:08:38.000+08:00
* [Feat] update gsm8k and math agent config

* minor fix
diff --git a/configs/datasets/math/math_agent_gen_861b4f.py b/configs/datasets/math/math_agent_gen_861b4f.py
@@ -0,0 +1,89 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import AgentInferencer
+from opencompass.datasets import (
+    MATHDataset, MATHAgentEvaluator, math_postprocess
+)
+# uses the PAL format, but it does not perform well
+math_reader_cfg = dict(input_columns=['problem'], output_column='solution')
+
+math_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            round=[
+                # # ################################### NEW SHOT ###################################
+                dict(role='HUMAN', prompt='Find the coefficient of $x^3$ when $3(x^2 - x^3+x) +3(x +2x^3- 3x^2 + 3x^5+x^3) -5(1+x-4x^3 - x^2)$ is simplifie.'),
+                dict(role='BOT', prompt="""Tool:PythonInterpreter\nTool Input:from sympy import symbols, simplify
+
+def solution():
+    x = symbols('x')
+    expr = 3*(x**2 - x**3 + x) + 3*(x + 2*x**3 - 3*x**2 + 3*x**5 + x**3) - 5*(1 + x - 4*x**3 - x**2)
+    simplified_expr = simplify(expr)
+
+    x3_coefficient = simplified_expr.as_coefficients_dict()[x**3]
+    result = x3_coefficient
+    return result"""),
+                dict(role='SYSTEM', prompt='Response:26'),
+                dict(role='BOT', prompt='FinalAnswer: The final answer is $26$. I hope it is correct.'),
+                dict(role='HUMAN', prompt='The surface area of a sphere with radius $r$ is $4\pi r^2$. Including the area of its circular base, what is the total surface area of a hemisphere with radius 6 cm? Express your answer in terms of $\pi$.'),
+                dict(role='BOT', prompt="""Tool:PythonInterpreter\nTool Input:import math
+
+def solution():
+    radius = 6
+
+    # Surface area of the hemisphere
+    hemisphere_area = 2 * math.pi * radius**2
+
+    # Area of the circular base
+    base_area = math.pi * radius**2
+
+    # Total surface area
+    total_surface_area = hemisphere_area + base_area
+
+    # Formatting the result in LaTeX
+    result = r'{}\pi'.format(total_surface_area / math.pi)
+    return result"""),
+                dict(role='SYSTEM', prompt='Response:108.0\\pi'),
+                dict(role='BOT', prompt='FinalAnswer: The final answer is $108.0\pi$. I hope it is correct.'),
+                dict(role='HUMAN', prompt='Monica tosses a fair 6-sided die.  If the roll is a prime number, then she wins that amount of dollars (so that, for example, if she rolls 3, then she wins 3 dollars).  If the roll is composite, she wins nothing. Otherwise, she loses 3 dollars. What is the expected value of her winnings on one die toss? Express your answer as a dollar value to the nearest cent.'),
+                dict(role='BOT', prompt="""Tool:PythonInterpreter\nTool Input:def solution():
+    # Probabilities of each outcome
+    prime_prob = 1 / 6
+    composite_prob = 1 / 3
+    otherwise_prob = 1 / 6
+
+    # Expected value of each outcome
+    prime_expected_value = (2 * prime_prob) + (3 * prime_prob) + (5 * prime_prob)
+    composite_expected_value = 0 * composite_prob
+    otherwise_expected_value = -3 * otherwise_prob
+
+    # Total expected value
+    total_expected_value = prime_expected_value + composite_expected_value + otherwise_expected_value
+
+    # Dollar value to the nearest cent
+    result = "{:.2f}".format(total_expected_value)
+    return result"""),
+                dict(role='SYSTEM', prompt='Response:1.17'),
+                dict(role='BOT', prompt='FinalAnswer: The final answer is $1.17$. I hope it is correct.'),
+                dict(role='HUMAN', prompt='{problem}'),
+            ])),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=AgentInferencer),
+)
+
+math_eval_cfg = dict(
+    evaluator=dict(type=MATHAgentEvaluator),
+    pred_postprocessor=dict(type=math_postprocess),
+)
+
+math_datasets = [
+    dict(
+        abbr='math',
+        type=MATHDataset,
+        path='./data/math/math.json',
+        reader_cfg=math_reader_cfg,
+        infer_cfg=math_infer_cfg,
+        eval_cfg=math_eval_cfg,
+    )
+]
diff --git a/opencompass/datasets/gsm8k.py b/opencompass/datasets/gsm8k.py
@@ -49,9 +49,10 @@ def gsm8k_postprocess(text: str) -> str:
             break
     ret1 = ''
     for i in range(len(ret)):
-        if ret[i].isdigit():
+        # also keep '.' so that potential float numbers are handled
+        if ret[i].isdigit() or ret[i] == '.':
             ret1 += ret[i]
-    return ret1
+    return ret1.strip('.')
 
 
 class Gsm8kEvaluator(BaseEvaluator):
@@ -87,15 +88,23 @@ class Gsm8kAgentEvaluator(BaseEvaluator):
     def __init__(self, action: str = 'PythonInterpreter'):
         self.action = action
 
+    def is_equal(self, pred, refer):
+        try:
+            if pred == refer or abs(float(pred) - int(refer)) < 1e-6:
+                return True
+        except Exception:
+            pass
+        return False
+
     def soft_equal(self, pred, refer, step):
         try:
             soft_pred = step['result']['text']
-            if str(int(float(soft_pred))) == refer:
+            if abs(float(soft_pred) - int(refer)) < 1e-6:
                 return True
         except Exception:
             # result might not exists
             # text cannot convert to float
-            print(pred, soft_pred, refer)
+            pass
         return False
 
     def get_action(self, step):
@@ -114,7 +123,7 @@ def score(self, predictions, references, steps):
         total = len(references)
         for pred, refer, step in zip(predictions, references, steps):
             # if final answer right
-            if pred == refer:
+            if self.is_equal(pred, refer):
                 if self.get_action(step):
                     final_scope += 1
                 else:
diff --git a/opencompass/models/huggingface.py b/opencompass/models/huggingface.py
@@ -49,6 +49,9 @@ class HuggingFace(BaseModel):
         use_fastchat_template (str, optional): Whether to use fastchat to get
             the conversation template. If True, fastchat needs to be
             implemented first. Defaults to False.
+        end_str (str, optional): If set, each generated string is cut at the
+            first occurrence of end_str; useful when the model has special
+            ending strings that are not handled well. Defaults to None.
 
     Note:
         About ``extract_pred_after_decode``: Commonly, we should extract the
@@ -72,7 +75,8 @@ def __init__(self,
                  batch_padding: bool = False,
                  pad_token_id: Optional[int] = None,
                  mode: str = 'none',
-                 use_fastchat_template: bool = False):
+                 use_fastchat_template: bool = False,
+                 end_str: Optional[str] = None):
         super().__init__(path=path,
                          max_seq_len=max_seq_len,
                          tokenizer_only=tokenizer_only,
@@ -96,6 +100,7 @@ def __init__(self,
                              peft_path=peft_path)
         self.generation_kwargs = generation_kwargs
         self.use_fastchat_template = use_fastchat_template
+        self.end_str = end_str
 
     def _load_tokenizer(self, path: str, tokenizer_path: Optional[str],
                         tokenizer_kwargs: dict):
@@ -266,6 +271,8 @@ def _batch_generate(self, inputs: List[str], max_out_len: int,
                 token[len_:] for token, len_ in zip(decodeds, prompt_lens)
             ]
 
+        if self.end_str:
+            decodeds = [token.split(self.end_str)[0] for token in decodeds]
         return decodeds
 
     def _single_generate(self, inputs: List[str], max_out_len: int,
@@ -329,6 +336,8 @@ def _single_generate(self, inputs: List[str], max_out_len: int,
                 token[len_:] for token, len_ in zip(decodeds, prompt_lens)
             ]
 
+        if self.end_str:
+            decodeds = [token.split(self.end_str)[0] for token in decodeds]
         return decodeds
 
     def get_logits(self, inputs: List[str]):