From a59b793d8b50f2d7f495a26da60a4edf0203fa9e Mon Sep 17 00:00:00 2001 From: Chaoying Date: Tue, 22 Mar 2022 14:40:47 +0800 Subject: [PATCH 1/5] Add PRef operator (#988) --- qlib/data/base.py | 4 +-- qlib/data/data.py | 31 +++++++++++++------ qlib/data/ops.py | 4 +-- qlib/data/pit.py | 17 +++++++++- scripts/data_collector/pit/test_pit.py | 43 +++++++++++++++++++++++++- 5 files changed, 84 insertions(+), 15 deletions(-) diff --git a/qlib/data/base.py b/qlib/data/base.py index 1a3fd1b0ed..9fd17c7c0e 100644 --- a/qlib/data/base.py +++ b/qlib/data/base.py @@ -254,10 +254,10 @@ class PFeature(Feature): def __str__(self): return "$$" + self._name - def _load_internal(self, instrument, start_index, end_index, cur_time): + def _load_internal(self, instrument, start_index, end_index, cur_time, period=None): from .data import PITD # pylint: disable=C0415 - return PITD.period_feature(instrument, str(self), start_index, end_index, cur_time) + return PITD.period_feature(instrument, str(self), start_index, end_index, cur_time, period) class ExpressionOps(Expression): diff --git a/qlib/data/data.py b/qlib/data/data.py index bb2a2acdb5..bc0750095e 100644 --- a/qlib/data/data.py +++ b/qlib/data/data.py @@ -12,7 +12,7 @@ import bisect import numpy as np import pandas as pd -from typing import List, Union +from typing import List, Union, Tuple # For supporting multiprocessing in outer code, joblib is used from joblib import delayed @@ -335,7 +335,15 @@ def feature(self, instrument, field, start_time, end_time, freq): class PITProvider(abc.ABC): @abc.abstractmethod - def period_feature(self, instrument, field, start_index: int, end_index: int, cur_time: pd.Timestamp) -> pd.Series: + def period_feature( + self, + instrument, + field, + start_index: int, + end_index: int, + cur_time: pd.Timestamp, + period_list: Tuple[int] = None, + ) -> pd.Series: """ get the historical periods data series between `start_index` and `end_index` @@ -732,7 +740,7 @@ class LocalPITProvider(PITProvider): # 
TODO: Add PIT backend file storage # NOTE: This class is not multi-threading-safe!!!! - def period_feature(self, instrument, field, start_index, end_index, cur_time): + def period_feature(self, instrument, field, start_index, end_index, cur_time, period=None): if not isinstance(cur_time, pd.Timestamp): raise ValueError( f"Expected pd.Timestamp for `cur_time`, got '{cur_time}'. Advices: you can't query PIT data directly(e.g. '$$roewa_q'), you must use `P` operator to convert data to each day (e.g. 'P($$roewa_q)')" @@ -771,8 +779,8 @@ def period_feature(self, instrument, field, start_index, end_index, cur_time): if not (index_path.exists() and data_path.exists()): raise FileNotFoundError("No file is found. Raise exception and ") # NOTE: The most significant performance loss is here. - # Does the accelration that makes the program complicated really matters? - # - It make parameters parameters of the interface complicate + # Does the acceleration that makes the program complicated really matters? + # - It makes parameters of the interface complicate # - It does not performance in the optimal way (places all the pieces together, we may achieve higher performance) # - If we design it carefully, we can go through for only once to get the historical evolution of the data. 
# So I decide to deprecated previous implementation and keep the logic of the program simple @@ -786,14 +794,19 @@ def period_feature(self, instrument, field, start_index, end_index, cur_time): return pd.Series() last_period = data["period"][:loc].max() # return the latest quarter first_period = data["period"][:loc].min() - period_list = get_period_list(first_period, last_period, quarterly) - period_list = period_list[max(0, len(period_list) + start_index - 1) : len(period_list) + end_index] + if period is not None: + if period not in period_list: + return pd.Series() + else: + period_list = [period] + else: + period_list = period_list[max(0, len(period_list) + start_index - 1) : len(period_list) + end_index] value = np.full((len(period_list),), np.nan, dtype=VALUE_DTYPE) - for i, period in enumerate(period_list): + for i, p in enumerate(period_list): # last_period_index = self.period_index[field].get(period) # For acceleration value[i], now_period_index = read_period_data( - index_path, data_path, period, cur_time_int, quarterly # , last_period_index # For acceleration + index_path, data_path, p, cur_time_int, quarterly # , last_period_index # For acceleration ) # self.period_index[field].update({period: now_period_index}) # For acceleration # NOTE: the index is period_list; So it may result in unexpected values(e.g. 
nan) diff --git a/qlib/data/ops.py b/qlib/data/ops.py index bdc032c037..2b742bebe0 100644 --- a/qlib/data/ops.py +++ b/qlib/data/ops.py @@ -1643,10 +1643,10 @@ def register_all_ops(C): """register all operator""" logger = get_module_logger("ops") - from qlib.data.pit import P # pylint: disable=C0415 + from qlib.data.pit import P, PRef # pylint: disable=C0415 Operators.reset() - Operators.register(OpsList + [P]) + Operators.register(OpsList + [P, PRef]) if getattr(C, "custom_ops", None) is not None: Operators.register(C.custom_ops) diff --git a/qlib/data/pit.py b/qlib/data/pit.py index ebe01eaf26..093b98cab3 100644 --- a/qlib/data/pit.py +++ b/qlib/data/pit.py @@ -37,7 +37,7 @@ def _load_internal(self, instrument, start_index, end_index, freq): # The calculated value will always the last element, so the end_offset is zero. try: - s = self.feature.load(instrument, -start_ws, 0, cur_time) + s = self._load_feature(instrument, -start_ws, 0, cur_time) resample_data[cur_index - start_index] = s.iloc[-1] if len(s) > 0 else np.nan except FileNotFoundError: get_module_logger("base").warning(f"WARN: period data not found for {str(self)}") @@ -48,6 +48,9 @@ def _load_internal(self, instrument, start_index, end_index, freq): ) return resample_series + def _load_feature(self, instrument, start_index, end_index, cur_time): + return self.feature.load(instrument, start_index, end_index, cur_time) + def get_longest_back_rolling(self): # The period data will collapse as a normal feature. So no extending and looking back return 0 @@ -55,3 +58,15 @@ def get_longest_back_rolling(self): def get_extended_window_size(self): # The period data will collapse as a normal feature. 
So no extending and looking back return 0, 0 + + +class PRef(P): + def __init__(self, feature, period): + super().__init__(feature) + self.period = period + + def __str__(self): + return f"{super().__str__()}[{self.period}]" + + def _load_feature(self, instrument, start_index, end_index, cur_time): + return self.feature.load(instrument, start_index, end_index, cur_time, self.period) diff --git a/scripts/data_collector/pit/test_pit.py b/scripts/data_collector/pit/test_pit.py index 4dedd85cf0..8801719c2b 100644 --- a/scripts/data_collector/pit/test_pit.py +++ b/scripts/data_collector/pit/test_pit.py @@ -92,7 +92,7 @@ def test_expr(self): "P((Ref($$roewa_q, 1) +$$roewa_q) / 2)", ] instruments = ["sh600519"] - data = D.features(instruments, fields, start_time="2019-01-01", end_time="20190719", freq="day") + data = D.features(instruments, fields, start_time="2019-01-01", end_time="2019-07-19", freq="day") expect = """ P(Mean($$roewa_q, 1)) P($$roewa_q) P(Mean($$roewa_q, 2)) P(Ref($$roewa_q, 1)) P((Ref($$roewa_q, 1) +$$roewa_q) / 2) instrument datetime @@ -189,6 +189,47 @@ def test_expr2(self): fields += ["P(Sum($$yoyni_q, 4))"] fields += ["$close", "P($$roewa_q) * $close"] data = D.features(instruments, fields, start_time="2019-01-01", end_time="2020-01-01", freq="day") + except_data = """ + P($$roewa_q) P($$yoyni_q) P(($$roewa_q / $$yoyni_q) / Ref($$roewa_q / $$yoyni_q, 1) - 1) P(Sum($$yoyni_q, 4)) $close P($$roewa_q) * $close + instrument datetime + sh600519 2019-01-02 0.255220 0.243892 1.484224 1.661578 63.595333 16.230801 + 2019-01-03 0.255220 0.243892 1.484224 1.661578 62.641907 15.987467 + 2019-01-04 0.255220 0.243892 1.484224 1.661578 63.915985 16.312637 + 2019-01-07 0.255220 0.243892 1.484224 1.661578 64.286530 16.407207 + 2019-01-08 0.255220 0.243892 1.484224 1.661578 64.212196 16.388237 + ... ... ... ... ... ... ... 
+ 2019-12-25 0.255819 0.219821 0.677052 1.081693 122.150467 31.248409 + 2019-12-26 0.255819 0.219821 0.677052 1.081693 122.301315 31.286999 + 2019-12-27 0.255819 0.219821 0.677052 1.081693 125.307404 32.056015 + 2019-12-30 0.255819 0.219821 0.677052 1.081693 127.763992 32.684456 + 2019-12-31 0.255819 0.219821 0.677052 1.081693 127.462303 32.607277 + + [244 rows x 6 columns] + """ + self.check_same(data, except_data) + + def test_pref_operator(self): + instruments = ["sh600519"] + fields = ["PRef($$roewa_q, 201902)", "PRef($$yoyni_q, 201801)", "P($$roewa_q)"] + data = D.features(instruments, fields, start_time="2018-04-28", end_time="2019-07-19", freq="day") + except_data = """ + PRef($$roewa_q, 201902) PRef($$yoyni_q, 201801) P($$roewa_q) + instrument datetime + sh600519 2018-05-02 NaN 0.395075 0.088887 + 2018-05-03 NaN 0.395075 0.088887 + 2018-05-04 NaN 0.395075 0.088887 + 2018-05-07 NaN 0.395075 0.088887 + 2018-05-08 NaN 0.395075 0.088887 + ... ... ... ... + 2019-07-15 0.000000 0.395075 0.000000 + 2019-07-16 0.000000 0.395075 0.000000 + 2019-07-17 0.000000 0.395075 0.000000 + 2019-07-18 0.175322 0.395075 0.175322 + 2019-07-19 0.175322 0.395075 0.175322 + + [299 rows x 3 columns] + """ + self.check_same(data, except_data) if __name__ == "__main__": From 7bc8426f53045db6f0b7fdbd22614c8579487e1b Mon Sep 17 00:00:00 2001 From: Chaoying Date: Tue, 22 Mar 2022 14:53:46 +0800 Subject: [PATCH 2/5] Fix type annotations --- qlib/data/data.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/qlib/data/data.py b/qlib/data/data.py index bc0750095e..2119b10246 100644 --- a/qlib/data/data.py +++ b/qlib/data/data.py @@ -12,7 +12,7 @@ import bisect import numpy as np import pandas as pd -from typing import List, Union, Tuple +from typing import List, Union, Optional # For supporting multiprocessing in outer code, joblib is used from joblib import delayed @@ -342,7 +342,7 @@ def period_feature( start_index: int, end_index: int, cur_time: pd.Timestamp, - 
period_list: Tuple[int] = None, + period: Optional[int] = None, ) -> pd.Series: """ get the historical periods data series between `start_index` and `end_index` From b607a1129005ed120c286a8adfe63641fc7b89dc Mon Sep 17 00:00:00 2001 From: Chaoying Date: Wed, 23 Mar 2022 15:56:38 +0800 Subject: [PATCH 3/5] Add test_pref_operator test case field --- scripts/data_collector/pit/test_pit.py | 35 +++++++++++++++----------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/scripts/data_collector/pit/test_pit.py b/scripts/data_collector/pit/test_pit.py index 8801719c2b..71d94ee8ba 100644 --- a/scripts/data_collector/pit/test_pit.py +++ b/scripts/data_collector/pit/test_pit.py @@ -210,24 +210,29 @@ def test_expr2(self): def test_pref_operator(self): instruments = ["sh600519"] - fields = ["PRef($$roewa_q, 201902)", "PRef($$yoyni_q, 201801)", "P($$roewa_q)"] + fields = [ + "PRef($$roewa_q, 201902)", + "PRef($$yoyni_q, 201801)", + "P($$roewa_q)", + "P($$roewa_q) / PRef($$roewa_q, 201801)", + ] data = D.features(instruments, fields, start_time="2018-04-28", end_time="2019-07-19", freq="day") except_data = """ - PRef($$roewa_q, 201902) PRef($$yoyni_q, 201801) P($$roewa_q) - instrument datetime - sh600519 2018-05-02 NaN 0.395075 0.088887 - 2018-05-03 NaN 0.395075 0.088887 - 2018-05-04 NaN 0.395075 0.088887 - 2018-05-07 NaN 0.395075 0.088887 - 2018-05-08 NaN 0.395075 0.088887 - ... ... ... ... - 2019-07-15 0.000000 0.395075 0.000000 - 2019-07-16 0.000000 0.395075 0.000000 - 2019-07-17 0.000000 0.395075 0.000000 - 2019-07-18 0.175322 0.395075 0.175322 - 2019-07-19 0.175322 0.395075 0.175322 + PRef($$roewa_q, 201902) PRef($$yoyni_q, 201801) P($$roewa_q) P($$roewa_q) / PRef($$roewa_q, 201801) + instrument datetime + sh600519 2018-05-02 NaN 0.395075 0.088887 1.000000 + 2018-05-03 NaN 0.395075 0.088887 1.000000 + 2018-05-04 NaN 0.395075 0.088887 1.000000 + 2018-05-07 NaN 0.395075 0.088887 1.000000 + 2018-05-08 NaN 0.395075 0.088887 1.000000 + ... ... ... ... ... 
+ 2019-07-15 0.000000 0.395075 0.000000 0.000000 + 2019-07-16 0.000000 0.395075 0.000000 0.000000 + 2019-07-17 0.000000 0.395075 0.000000 0.000000 + 2019-07-18 0.175322 0.395075 0.175322 1.972414 + 2019-07-19 0.175322 0.395075 0.175322 1.972414 - [299 rows x 3 columns] + [299 rows x 4 columns] """ self.check_same(data, except_data) From ff031dd279ad1ce41a6a3eacabb1b0d1705f3cb7 Mon Sep 17 00:00:00 2001 From: Chaoying Date: Wed, 23 Mar 2022 16:00:30 +0800 Subject: [PATCH 4/5] Add note to PITProvider --- qlib/data/data.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/qlib/data/data.py b/qlib/data/data.py index 2119b10246..ee82a2e2ed 100644 --- a/qlib/data/data.py +++ b/qlib/data/data.py @@ -358,6 +358,11 @@ def period_feature( For example, start_index == -3 end_index == 0 and current period index is cur_idx, then the data between [start_index + cur_idx, end_index + cur_idx] will be retrieved. + period: int + This is used for query specific period. + The period is represented with int in Qlib. (e.g. 
202001 may represent the first quarter in 2020) + NOTE: `period` will override `start_index` and `end_index` + Returns ------- pd.Series @@ -796,6 +801,7 @@ def period_feature(self, instrument, field, start_index, end_index, cur_time, pe first_period = data["period"][:loc].min() period_list = get_period_list(first_period, last_period, quarterly) if period is not None: + # NOTE: `period` has higher priority than `start_index` & `end_index` if period not in period_list: return pd.Series() else: From fee98c192d9f3aefc6a7a35960895fec526f5253 Mon Sep 17 00:00:00 2001 From: Chaoying Date: Thu, 24 Mar 2022 09:48:58 +0800 Subject: [PATCH 5/5] Add period parameter comment --- qlib/data/base.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/qlib/data/base.py b/qlib/data/base.py index 9fd17c7c0e..953c222535 100644 --- a/qlib/data/base.py +++ b/qlib/data/base.py @@ -162,6 +162,9 @@ def load(self, instrument, start_index, end_index, *args): 2) if is used in PIT data, it contains following arguments cur_pit: it is designed for the point-in-time data. + period: int + This is used to query a specific period. + The period is represented as an int in Qlib. (e.g. 202001 may represent the first quarter in 2020) Returns ----------