From a59b793d8b50f2d7f495a26da60a4edf0203fa9e Mon Sep 17 00:00:00 2001
From: Chaoying <chaunceywe@gmail.com>
Date: Tue, 22 Mar 2022 14:40:47 +0800
Subject: [PATCH 1/5] Add PRef operator (#988)

---
 qlib/data/base.py                      |  4 +--
 qlib/data/data.py                      | 31 +++++++++++++------
 qlib/data/ops.py                       |  4 +--
 qlib/data/pit.py                       | 17 +++++++++-
 scripts/data_collector/pit/test_pit.py | 43 +++++++++++++++++++++++++-
 5 files changed, 84 insertions(+), 15 deletions(-)

diff --git a/qlib/data/base.py b/qlib/data/base.py
index 1a3fd1b0ed..9fd17c7c0e 100644
--- a/qlib/data/base.py
+++ b/qlib/data/base.py
@@ -254,10 +254,10 @@ class PFeature(Feature):
     def __str__(self):
         return "$$" + self._name
 
-    def _load_internal(self, instrument, start_index, end_index, cur_time):
+    def _load_internal(self, instrument, start_index, end_index, cur_time, period=None):
         from .data import PITD  # pylint: disable=C0415
 
-        return PITD.period_feature(instrument, str(self), start_index, end_index, cur_time)
+        return PITD.period_feature(instrument, str(self), start_index, end_index, cur_time, period)
 
 
 class ExpressionOps(Expression):
diff --git a/qlib/data/data.py b/qlib/data/data.py
index bb2a2acdb5..bc0750095e 100644
--- a/qlib/data/data.py
+++ b/qlib/data/data.py
@@ -12,7 +12,7 @@
 import bisect
 import numpy as np
 import pandas as pd
-from typing import List, Union
+from typing import List, Union, Tuple
 
 # For supporting multiprocessing in outer code, joblib is used
 from joblib import delayed
@@ -335,7 +335,15 @@ def feature(self, instrument, field, start_time, end_time, freq):
 
 class PITProvider(abc.ABC):
     @abc.abstractmethod
-    def period_feature(self, instrument, field, start_index: int, end_index: int, cur_time: pd.Timestamp) -> pd.Series:
+    def period_feature(
+        self,
+        instrument,
+        field,
+        start_index: int,
+        end_index: int,
+        cur_time: pd.Timestamp,
+        period_list: Tuple[int] = None,
+    ) -> pd.Series:
         """
         get the historical periods data series between `start_index` and `end_index`
 
@@ -732,7 +740,7 @@ class LocalPITProvider(PITProvider):
     # TODO: Add PIT backend file storage
     # NOTE: This class is not multi-threading-safe!!!!
 
-    def period_feature(self, instrument, field, start_index, end_index, cur_time):
+    def period_feature(self, instrument, field, start_index, end_index, cur_time, period=None):
         if not isinstance(cur_time, pd.Timestamp):
             raise ValueError(
                 f"Expected pd.Timestamp for `cur_time`, got '{cur_time}'. Advices: you can't query PIT data directly(e.g. '$$roewa_q'), you must use `P` operator to convert data to each day (e.g. 'P($$roewa_q)')"
@@ -771,8 +779,8 @@ def period_feature(self, instrument, field, start_index, end_index, cur_time):
         if not (index_path.exists() and data_path.exists()):
             raise FileNotFoundError("No file is found. Raise exception and  ")
         # NOTE: The most significant performance loss is here.
-        # Does the accelration that makes the program complicated really matters?
-        # - It make parameters parameters of the interface complicate
+        # Does the acceleration that makes the program complicated really matters?
+        # - It makes parameters of the interface complicate
         # - It does not performance in the optimal way (places all the pieces together, we may achieve higher performance)
         #    - If we design it carefully, we can go through for only once to get the historical evolution of the data.
         # So I decide to deprecated previous implementation and keep the logic of the program simple
@@ -786,14 +794,19 @@ def period_feature(self, instrument, field, start_index, end_index, cur_time):
             return pd.Series()
         last_period = data["period"][:loc].max()  # return the latest quarter
         first_period = data["period"][:loc].min()
-
         period_list = get_period_list(first_period, last_period, quarterly)
-        period_list = period_list[max(0, len(period_list) + start_index - 1) : len(period_list) + end_index]
+        if period is not None:
+            if period not in period_list:
+                return pd.Series()
+            else:
+                period_list = [period]
+        else:
+            period_list = period_list[max(0, len(period_list) + start_index - 1) : len(period_list) + end_index]
         value = np.full((len(period_list),), np.nan, dtype=VALUE_DTYPE)
-        for i, period in enumerate(period_list):
+        for i, p in enumerate(period_list):
             # last_period_index = self.period_index[field].get(period)  # For acceleration
             value[i], now_period_index = read_period_data(
-                index_path, data_path, period, cur_time_int, quarterly  # , last_period_index  # For acceleration
+                index_path, data_path, p, cur_time_int, quarterly  # , last_period_index  # For acceleration
             )
             # self.period_index[field].update({period: now_period_index})  # For acceleration
         # NOTE: the index is period_list; So it may result in unexpected values(e.g. nan)
diff --git a/qlib/data/ops.py b/qlib/data/ops.py
index bdc032c037..2b742bebe0 100644
--- a/qlib/data/ops.py
+++ b/qlib/data/ops.py
@@ -1643,10 +1643,10 @@ def register_all_ops(C):
     """register all operator"""
     logger = get_module_logger("ops")
 
-    from qlib.data.pit import P  # pylint: disable=C0415
+    from qlib.data.pit import P, PRef  # pylint: disable=C0415
 
     Operators.reset()
-    Operators.register(OpsList + [P])
+    Operators.register(OpsList + [P, PRef])
 
     if getattr(C, "custom_ops", None) is not None:
         Operators.register(C.custom_ops)
diff --git a/qlib/data/pit.py b/qlib/data/pit.py
index ebe01eaf26..093b98cab3 100644
--- a/qlib/data/pit.py
+++ b/qlib/data/pit.py
@@ -37,7 +37,7 @@ def _load_internal(self, instrument, start_index, end_index, freq):
 
             # The calculated value will always the last element, so the end_offset is zero.
             try:
-                s = self.feature.load(instrument, -start_ws, 0, cur_time)
+                s = self._load_feature(instrument, -start_ws, 0, cur_time)
                 resample_data[cur_index - start_index] = s.iloc[-1] if len(s) > 0 else np.nan
             except FileNotFoundError:
                 get_module_logger("base").warning(f"WARN: period data not found for {str(self)}")
@@ -48,6 +48,9 @@ def _load_internal(self, instrument, start_index, end_index, freq):
         )
         return resample_series
 
+    def _load_feature(self, instrument, start_index, end_index, cur_time):
+        return self.feature.load(instrument, start_index, end_index, cur_time)
+
     def get_longest_back_rolling(self):
         # The period data will collapse as a normal feature. So no extending and looking back
         return 0
@@ -55,3 +58,15 @@ def get_longest_back_rolling(self):
     def get_extended_window_size(self):
         # The period data will collapse as a normal feature. So no extending and looking back
         return 0, 0
+
+
+class PRef(P):
+    def __init__(self, feature, period):
+        super().__init__(feature)
+        self.period = period
+
+    def __str__(self):
+        return f"{super().__str__()}[{self.period}]"
+
+    def _load_feature(self, instrument, start_index, end_index, cur_time):
+        return self.feature.load(instrument, start_index, end_index, cur_time, self.period)
diff --git a/scripts/data_collector/pit/test_pit.py b/scripts/data_collector/pit/test_pit.py
index 4dedd85cf0..8801719c2b 100644
--- a/scripts/data_collector/pit/test_pit.py
+++ b/scripts/data_collector/pit/test_pit.py
@@ -92,7 +92,7 @@ def test_expr(self):
             "P((Ref($$roewa_q, 1) +$$roewa_q) / 2)",
         ]
         instruments = ["sh600519"]
-        data = D.features(instruments, fields, start_time="2019-01-01", end_time="20190719", freq="day")
+        data = D.features(instruments, fields, start_time="2019-01-01", end_time="2019-07-19", freq="day")
         expect = """
                                P(Mean($$roewa_q, 1))  P($$roewa_q)  P(Mean($$roewa_q, 2))  P(Ref($$roewa_q, 1))  P((Ref($$roewa_q, 1) +$$roewa_q) / 2)
         instrument datetime
@@ -189,6 +189,47 @@ def test_expr2(self):
         fields += ["P(Sum($$yoyni_q, 4))"]
         fields += ["$close", "P($$roewa_q) * $close"]
         data = D.features(instruments, fields, start_time="2019-01-01", end_time="2020-01-01", freq="day")
+        except_data = """
+                                       P($$roewa_q)  P($$yoyni_q)  P(($$roewa_q / $$yoyni_q) / Ref($$roewa_q / $$yoyni_q, 1) - 1)  P(Sum($$yoyni_q, 4))      $close  P($$roewa_q) * $close
+        instrument datetime                                                                                                                                                       
+        sh600519   2019-01-02      0.255220      0.243892                                           1.484224                           1.661578   63.595333              16.230801
+                   2019-01-03      0.255220      0.243892                                           1.484224                           1.661578   62.641907              15.987467
+                   2019-01-04      0.255220      0.243892                                           1.484224                           1.661578   63.915985              16.312637
+                   2019-01-07      0.255220      0.243892                                           1.484224                           1.661578   64.286530              16.407207
+                   2019-01-08      0.255220      0.243892                                           1.484224                           1.661578   64.212196              16.388237
+        ...                             ...           ...                                                ...                                ...         ...                    ...
+                   2019-12-25      0.255819      0.219821                                           0.677052                           1.081693  122.150467              31.248409
+                   2019-12-26      0.255819      0.219821                                           0.677052                           1.081693  122.301315              31.286999
+                   2019-12-27      0.255819      0.219821                                           0.677052                           1.081693  125.307404              32.056015
+                   2019-12-30      0.255819      0.219821                                           0.677052                           1.081693  127.763992              32.684456
+                   2019-12-31      0.255819      0.219821                                           0.677052                           1.081693  127.462303              32.607277
+        
+        [244 rows x 6 columns]
+        """
+        self.check_same(data, except_data)
+
+    def test_pref_operator(self):
+        instruments = ["sh600519"]
+        fields = ["PRef($$roewa_q, 201902)", "PRef($$yoyni_q, 201801)", "P($$roewa_q)"]
+        data = D.features(instruments, fields, start_time="2018-04-28", end_time="2019-07-19", freq="day")
+        except_data = """
+                               PRef($$roewa_q, 201902)  PRef($$yoyni_q, 201801)  P($$roewa_q)
+        instrument datetime                                                                  
+        sh600519   2018-05-02                      NaN                 0.395075      0.088887
+                   2018-05-03                      NaN                 0.395075      0.088887
+                   2018-05-04                      NaN                 0.395075      0.088887
+                   2018-05-07                      NaN                 0.395075      0.088887
+                   2018-05-08                      NaN                 0.395075      0.088887
+        ...                                        ...                      ...           ...
+                   2019-07-15                 0.000000                 0.395075      0.000000
+                   2019-07-16                 0.000000                 0.395075      0.000000
+                   2019-07-17                 0.000000                 0.395075      0.000000
+                   2019-07-18                 0.175322                 0.395075      0.175322
+                   2019-07-19                 0.175322                 0.395075      0.175322
+        
+        [299 rows x 3 columns]
+        """
+        self.check_same(data, except_data)
 
 
 if __name__ == "__main__":

From 7bc8426f53045db6f0b7fdbd22614c8579487e1b Mon Sep 17 00:00:00 2001
From: Chaoying <chaunceywe@gmail.com>
Date: Tue, 22 Mar 2022 14:53:46 +0800
Subject: [PATCH 2/5] Fix type annotations

---
 qlib/data/data.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/qlib/data/data.py b/qlib/data/data.py
index bc0750095e..2119b10246 100644
--- a/qlib/data/data.py
+++ b/qlib/data/data.py
@@ -12,7 +12,7 @@
 import bisect
 import numpy as np
 import pandas as pd
-from typing import List, Union, Tuple
+from typing import List, Union, Optional
 
 # For supporting multiprocessing in outer code, joblib is used
 from joblib import delayed
@@ -342,7 +342,7 @@ def period_feature(
         start_index: int,
         end_index: int,
         cur_time: pd.Timestamp,
-        period_list: Tuple[int] = None,
+        period: Optional[int] = None,
     ) -> pd.Series:
         """
         get the historical periods data series between `start_index` and `end_index`

From b607a1129005ed120c286a8adfe63641fc7b89dc Mon Sep 17 00:00:00 2001
From: Chaoying <chaunceywe@gmail.com>
Date: Wed, 23 Mar 2022 15:56:38 +0800
Subject: [PATCH 3/5] Add test_pref_operator test case field

---
 scripts/data_collector/pit/test_pit.py | 35 +++++++++++++++-----------
 1 file changed, 20 insertions(+), 15 deletions(-)

diff --git a/scripts/data_collector/pit/test_pit.py b/scripts/data_collector/pit/test_pit.py
index 8801719c2b..71d94ee8ba 100644
--- a/scripts/data_collector/pit/test_pit.py
+++ b/scripts/data_collector/pit/test_pit.py
@@ -210,24 +210,29 @@ def test_expr2(self):
 
     def test_pref_operator(self):
         instruments = ["sh600519"]
-        fields = ["PRef($$roewa_q, 201902)", "PRef($$yoyni_q, 201801)", "P($$roewa_q)"]
+        fields = [
+            "PRef($$roewa_q, 201902)",
+            "PRef($$yoyni_q, 201801)",
+            "P($$roewa_q)",
+            "P($$roewa_q) / PRef($$roewa_q, 201801)",
+        ]
         data = D.features(instruments, fields, start_time="2018-04-28", end_time="2019-07-19", freq="day")
         except_data = """
-                               PRef($$roewa_q, 201902)  PRef($$yoyni_q, 201801)  P($$roewa_q)
-        instrument datetime                                                                  
-        sh600519   2018-05-02                      NaN                 0.395075      0.088887
-                   2018-05-03                      NaN                 0.395075      0.088887
-                   2018-05-04                      NaN                 0.395075      0.088887
-                   2018-05-07                      NaN                 0.395075      0.088887
-                   2018-05-08                      NaN                 0.395075      0.088887
-        ...                                        ...                      ...           ...
-                   2019-07-15                 0.000000                 0.395075      0.000000
-                   2019-07-16                 0.000000                 0.395075      0.000000
-                   2019-07-17                 0.000000                 0.395075      0.000000
-                   2019-07-18                 0.175322                 0.395075      0.175322
-                   2019-07-19                 0.175322                 0.395075      0.175322
+                               PRef($$roewa_q, 201902)  PRef($$yoyni_q, 201801)  P($$roewa_q)  P($$roewa_q) / PRef($$roewa_q, 201801)
+        instrument datetime                                                                                                          
+        sh600519   2018-05-02                      NaN                 0.395075      0.088887                                1.000000
+                   2018-05-03                      NaN                 0.395075      0.088887                                1.000000
+                   2018-05-04                      NaN                 0.395075      0.088887                                1.000000
+                   2018-05-07                      NaN                 0.395075      0.088887                                1.000000
+                   2018-05-08                      NaN                 0.395075      0.088887                                1.000000
+        ...                                        ...                      ...           ...                                     ...
+                   2019-07-15                 0.000000                 0.395075      0.000000                                0.000000
+                   2019-07-16                 0.000000                 0.395075      0.000000                                0.000000
+                   2019-07-17                 0.000000                 0.395075      0.000000                                0.000000
+                   2019-07-18                 0.175322                 0.395075      0.175322                                1.972414
+                   2019-07-19                 0.175322                 0.395075      0.175322                                1.972414
         
-        [299 rows x 3 columns]
+        [299 rows x 4 columns]
         """
         self.check_same(data, except_data)
 

From ff031dd279ad1ce41a6a3eacabb1b0d1705f3cb7 Mon Sep 17 00:00:00 2001
From: Chaoying <chaunceywe@gmail.com>
Date: Wed, 23 Mar 2022 16:00:30 +0800
Subject: [PATCH 4/5] Add note to PITProvider

---
 qlib/data/data.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/qlib/data/data.py b/qlib/data/data.py
index 2119b10246..ee82a2e2ed 100644
--- a/qlib/data/data.py
+++ b/qlib/data/data.py
@@ -358,6 +358,11 @@ def period_feature(
             For example, start_index == -3 end_index == 0 and current period index is cur_idx,
             then the data between [start_index + cur_idx, end_index + cur_idx] will be retrieved.
 
+        period: int
+            This is used for query specific period.
+            The period is represented with int in Qlib. (e.g. 202001 may represent the first quarter in 2020)
+            NOTE: `period`  will override `start_index` and `end_index`
+
         Returns
         -------
         pd.Series
@@ -796,6 +801,7 @@ def period_feature(self, instrument, field, start_index, end_index, cur_time, pe
         first_period = data["period"][:loc].min()
         period_list = get_period_list(first_period, last_period, quarterly)
         if period is not None:
+            # NOTE: `period` has higher priority than `start_index` & `end_index`
             if period not in period_list:
                 return pd.Series()
             else:

From fee98c192d9f3aefc6a7a35960895fec526f5253 Mon Sep 17 00:00:00 2001
From: Chaoying <chaunceywe@gmail.com>
Date: Thu, 24 Mar 2022 09:48:58 +0800
Subject: [PATCH 5/5] Add period parameter comment

---
 qlib/data/base.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/qlib/data/base.py b/qlib/data/base.py
index 9fd17c7c0e..953c222535 100644
--- a/qlib/data/base.py
+++ b/qlib/data/base.py
@@ -162,6 +162,9 @@ def load(self, instrument, start_index, end_index, *args):
         2) if is used in PIT data, it contains following arguments
             cur_pit:
                 it is designed for the point-in-time data.
+            period: int
+                This is used for query specific period.
+                The period is represented with int in Qlib. (e.g. 202001 may represent the first quarter in 2020)
 
         Returns
         ----------