Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add PRef operator (#988) #1000

Merged
merged 5 commits into from
Mar 24, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions qlib/data/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,9 @@ def load(self, instrument, start_index, end_index, *args):
2) if is used in PIT data, it contains following arguments
cur_pit:
it is designed for the point-in-time data.
period: int
This is used for query specific period.
The period is represented with int in Qlib. (e.g. 202001 may represent the first quarter in 2020)

Returns
----------
Expand Down Expand Up @@ -254,10 +257,10 @@ class PFeature(Feature):
def __str__(self):
return "$$" + self._name

def _load_internal(self, instrument, start_index, end_index, cur_time):
def _load_internal(self, instrument, start_index, end_index, cur_time, period=None):
from .data import PITD # pylint: disable=C0415

return PITD.period_feature(instrument, str(self), start_index, end_index, cur_time)
return PITD.period_feature(instrument, str(self), start_index, end_index, cur_time, period)


class ExpressionOps(Expression):
Expand Down
37 changes: 28 additions & 9 deletions qlib/data/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
import bisect
import numpy as np
import pandas as pd
from typing import List, Union
from typing import List, Union, Optional

# For supporting multiprocessing in outer code, joblib is used
from joblib import delayed
Expand Down Expand Up @@ -335,7 +335,15 @@ def feature(self, instrument, field, start_time, end_time, freq):

class PITProvider(abc.ABC):
@abc.abstractmethod
def period_feature(self, instrument, field, start_index: int, end_index: int, cur_time: pd.Timestamp) -> pd.Series:
def period_feature(
self,
instrument,
field,
start_index: int,
end_index: int,
cur_time: pd.Timestamp,
period: Optional[int] = None,
) -> pd.Series:
"""
get the historical periods data series between `start_index` and `end_index`

Expand All @@ -350,6 +358,11 @@ def period_feature(self, instrument, field, start_index: int, end_index: int, cu
For example, start_index == -3 end_index == 0 and current period index is cur_idx,
then the data between [start_index + cur_idx, end_index + cur_idx] will be retrieved.

period: int
This is used for query specific period.
The period is represented with int in Qlib. (e.g. 202001 may represent the first quarter in 2020)
NOTE: `period` will override `start_index` and `end_index`

Returns
-------
pd.Series
Expand Down Expand Up @@ -732,7 +745,7 @@ class LocalPITProvider(PITProvider):
# TODO: Add PIT backend file storage
# NOTE: This class is not multi-threading-safe!!!!

def period_feature(self, instrument, field, start_index, end_index, cur_time):
def period_feature(self, instrument, field, start_index, end_index, cur_time, period=None):
if not isinstance(cur_time, pd.Timestamp):
raise ValueError(
f"Expected pd.Timestamp for `cur_time`, got '{cur_time}'. Advices: you can't query PIT data directly(e.g. '$$roewa_q'), you must use `P` operator to convert data to each day (e.g. 'P($$roewa_q)')"
Expand Down Expand Up @@ -771,8 +784,8 @@ def period_feature(self, instrument, field, start_index, end_index, cur_time):
if not (index_path.exists() and data_path.exists()):
raise FileNotFoundError("No file is found. Raise exception and ")
# NOTE: The most significant performance loss is here.
# Does the accelration that makes the program complicated really matters?
# - It make parameters parameters of the interface complicate
# Does the acceleration that makes the program complicated really matters?
# - It makes parameters of the interface complicate
# - It does not performance in the optimal way (places all the pieces together, we may achieve higher performance)
# - If we design it carefully, we can go through for only once to get the historical evolution of the data.
# So I decide to deprecated previous implementation and keep the logic of the program simple
Expand All @@ -786,14 +799,20 @@ def period_feature(self, instrument, field, start_index, end_index, cur_time):
return pd.Series()
last_period = data["period"][:loc].max() # return the latest quarter
first_period = data["period"][:loc].min()

period_list = get_period_list(first_period, last_period, quarterly)
period_list = period_list[max(0, len(period_list) + start_index - 1) : len(period_list) + end_index]
if period is not None:
# NOTE: `period` has higher priority than `start_index` & `end_index`
if period not in period_list:
return pd.Series()
else:
period_list = [period]
else:
period_list = period_list[max(0, len(period_list) + start_index - 1) : len(period_list) + end_index]
value = np.full((len(period_list),), np.nan, dtype=VALUE_DTYPE)
for i, period in enumerate(period_list):
for i, p in enumerate(period_list):
# last_period_index = self.period_index[field].get(period) # For acceleration
value[i], now_period_index = read_period_data(
index_path, data_path, period, cur_time_int, quarterly # , last_period_index # For acceleration
index_path, data_path, p, cur_time_int, quarterly # , last_period_index # For acceleration
)
# self.period_index[field].update({period: now_period_index}) # For acceleration
# NOTE: the index is period_list; So it may result in unexpected values(e.g. nan)
Expand Down
4 changes: 2 additions & 2 deletions qlib/data/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -1643,10 +1643,10 @@ def register_all_ops(C):
"""register all operator"""
logger = get_module_logger("ops")

from qlib.data.pit import P # pylint: disable=C0415
from qlib.data.pit import P, PRef # pylint: disable=C0415

Operators.reset()
Operators.register(OpsList + [P])
Operators.register(OpsList + [P, PRef])

if getattr(C, "custom_ops", None) is not None:
Operators.register(C.custom_ops)
Expand Down
17 changes: 16 additions & 1 deletion qlib/data/pit.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def _load_internal(self, instrument, start_index, end_index, freq):

# The calculated value will always the last element, so the end_offset is zero.
try:
s = self.feature.load(instrument, -start_ws, 0, cur_time)
s = self._load_feature(instrument, -start_ws, 0, cur_time)
resample_data[cur_index - start_index] = s.iloc[-1] if len(s) > 0 else np.nan
except FileNotFoundError:
get_module_logger("base").warning(f"WARN: period data not found for {str(self)}")
Expand All @@ -48,10 +48,25 @@ def _load_internal(self, instrument, start_index, end_index, freq):
)
return resample_series

def _load_feature(self, instrument, start_index, end_index, cur_time):
return self.feature.load(instrument, start_index, end_index, cur_time)

def get_longest_back_rolling(self):
# The period data will collapse as a normal feature. So no extending and looking back
return 0

def get_extended_window_size(self):
# The period data will collapse as a normal feature. So no extending and looking back
return 0, 0


class PRef(P):
def __init__(self, feature, period):
super().__init__(feature)
self.period = period

def __str__(self):
return f"{super().__str__()}[{self.period}]"

def _load_feature(self, instrument, start_index, end_index, cur_time):
return self.feature.load(instrument, start_index, end_index, cur_time, self.period)
48 changes: 47 additions & 1 deletion scripts/data_collector/pit/test_pit.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ def test_expr(self):
"P((Ref($$roewa_q, 1) +$$roewa_q) / 2)",
]
instruments = ["sh600519"]
data = D.features(instruments, fields, start_time="2019-01-01", end_time="20190719", freq="day")
data = D.features(instruments, fields, start_time="2019-01-01", end_time="2019-07-19", freq="day")
expect = """
P(Mean($$roewa_q, 1)) P($$roewa_q) P(Mean($$roewa_q, 2)) P(Ref($$roewa_q, 1)) P((Ref($$roewa_q, 1) +$$roewa_q) / 2)
instrument datetime
Expand Down Expand Up @@ -189,6 +189,52 @@ def test_expr2(self):
fields += ["P(Sum($$yoyni_q, 4))"]
fields += ["$close", "P($$roewa_q) * $close"]
data = D.features(instruments, fields, start_time="2019-01-01", end_time="2020-01-01", freq="day")
except_data = """
P($$roewa_q) P($$yoyni_q) P(($$roewa_q / $$yoyni_q) / Ref($$roewa_q / $$yoyni_q, 1) - 1) P(Sum($$yoyni_q, 4)) $close P($$roewa_q) * $close
instrument datetime
sh600519 2019-01-02 0.255220 0.243892 1.484224 1.661578 63.595333 16.230801
2019-01-03 0.255220 0.243892 1.484224 1.661578 62.641907 15.987467
2019-01-04 0.255220 0.243892 1.484224 1.661578 63.915985 16.312637
2019-01-07 0.255220 0.243892 1.484224 1.661578 64.286530 16.407207
2019-01-08 0.255220 0.243892 1.484224 1.661578 64.212196 16.388237
... ... ... ... ... ... ...
2019-12-25 0.255819 0.219821 0.677052 1.081693 122.150467 31.248409
2019-12-26 0.255819 0.219821 0.677052 1.081693 122.301315 31.286999
2019-12-27 0.255819 0.219821 0.677052 1.081693 125.307404 32.056015
2019-12-30 0.255819 0.219821 0.677052 1.081693 127.763992 32.684456
2019-12-31 0.255819 0.219821 0.677052 1.081693 127.462303 32.607277

[244 rows x 6 columns]
"""
self.check_same(data, except_data)

def test_pref_operator(self):
instruments = ["sh600519"]
fields = [
"PRef($$roewa_q, 201902)",
"PRef($$yoyni_q, 201801)",
"P($$roewa_q)",
"P($$roewa_q) / PRef($$roewa_q, 201801)",
]
data = D.features(instruments, fields, start_time="2018-04-28", end_time="2019-07-19", freq="day")
except_data = """
PRef($$roewa_q, 201902) PRef($$yoyni_q, 201801) P($$roewa_q) P($$roewa_q) / PRef($$roewa_q, 201801)
instrument datetime
sh600519 2018-05-02 NaN 0.395075 0.088887 1.000000
2018-05-03 NaN 0.395075 0.088887 1.000000
2018-05-04 NaN 0.395075 0.088887 1.000000
2018-05-07 NaN 0.395075 0.088887 1.000000
2018-05-08 NaN 0.395075 0.088887 1.000000
... ... ... ... ...
2019-07-15 0.000000 0.395075 0.000000 0.000000
2019-07-16 0.000000 0.395075 0.000000 0.000000
2019-07-17 0.000000 0.395075 0.000000 0.000000
2019-07-18 0.175322 0.395075 0.175322 1.972414
2019-07-19 0.175322 0.395075 0.175322 1.972414

[299 rows x 4 columns]
"""
self.check_same(data, except_data)


if __name__ == "__main__":
Expand Down