-
Notifications
You must be signed in to change notification settings - Fork 38
/
Copy pathtrain_trade_data_sentiment_chunk.py
187 lines (154 loc) · 6.46 KB
/
train_trade_data_sentiment_chunk.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
import pandas as pd
import numpy as np
import datetime
import yfinance as yf
import itertools
from finrl.meta.preprocessor.yahoodownloader import YahooDownloader
from finrl.meta.preprocessor.preprocessors import FeatureEngineer, data_split
from finrl.config import INDICATORS
# Constants
# Name of the sentiment score column read from the sentiment CSV; it is renamed
# to 'llm_sentiment' before being merged into the price data.
SENTIMENT_COLUMN = 'sentiment_deepseek'
# Number of sentiment CSV rows read per chunk (passed to pandas as `chunksize`).
CHUNK_SIZE = 1000
# Training window. TRAIN_START_DATE is also the start of the price download.
TRAIN_START_DATE = '2013-01-01'
TRAIN_END_DATE = '2018-12-31'
# Trading window. TRADE_END_DATE is also the end of the price download.
TRADE_START_DATE = '2019-01-01'
TRADE_END_DATE = '2023-12-31'
# Multipliers for the magnitude suffixes that may follow a volume figure.
_VOLUME_SUFFIXES = {'K': 1e3, 'M': 1e6, 'B': 1e9}


def _parse_volume(raw):
    """Convert one volume cell (e.g. '1.5M', '250K', '1,234', '-') to a float."""
    if raw is None:
        return float('nan')
    if not isinstance(raw, str):
        # Already numeric (including NaN from an empty CSV cell).
        return float(raw)
    text = raw.strip().replace(',', '')
    # An empty cell or a bare dash marks a missing volume, not a number.
    if text in ('', '-'):
        return float('nan')
    multiplier = _VOLUME_SUFFIXES.get(text[-1].upper())
    if multiplier is not None:
        return float(text[:-1]) * multiplier
    return float(text)


def load_seagen_data(csv_path='seagen.csv'):
    """Load the Seagen (SGEN) price history and normalise it to the OHLCV layout.

    The CSV is expected to provide the columns 'Date' (formatted MM/DD/YYYY),
    'Price', 'Open', 'High', 'Low', 'Vol.' and 'Change %'.

    Args:
        csv_path: Location of the Seagen CSV. Defaults to 'seagen.csv' in the
            current working directory.

    Returns:
        A DataFrame with columns 'date' (datetime64), 'close', 'open', 'high',
        'low', 'volume' (float, NaN where the source had no figure) and 'tic'
        (always 'SGEN'). The 'Change %' column is dropped.

    Raises:
        ValueError: If a volume cell is neither numeric, a suffixed number
            (K/M/B), nor a missing-value marker, or if a date does not match
            the MM/DD/YYYY format.
    """
    column_map = {
        'Date': 'date',
        'Price': 'close',
        'Open': 'open',
        'High': 'high',
        'Low': 'low',
        'Vol.': 'volume',
        'Change %': 'change_percent',
    }
    df_seagen = pd.read_csv(csv_path).rename(columns=column_map)

    # Tag every row with the ticker so it can be stacked with other stocks.
    df_seagen['tic'] = 'SGEN'

    # Volumes arrive as text such as '1.5M'; turn them into plain floats.
    df_seagen['volume'] = df_seagen['volume'].apply(_parse_volume)
    df_seagen['date'] = pd.to_datetime(df_seagen['date'], format='%m/%d/%Y')
    return df_seagen.drop(columns=['change_percent'])
def prepare_stock_data(nasdaq_tickers):
    """Download price data for the given tickers and append the local Seagen data.

    Every ticker except "SGEN" is fetched through YahooDownloader for the span
    TRAIN_START_DATE..TRADE_END_DATE; SGEN rows come from load_seagen_data().

    Args:
        nasdaq_tickers: Iterable of ticker symbols. "SGEN" is skipped for the
            download whether or not it is present.

    Returns:
        A DataFrame holding the downloaded rows plus the Seagen rows, sorted
        by date, with 'date' formatted as 'YYYY-MM-DD' strings.
    """
    # SGEN is supplied from the local CSV, so keep it out of the download list.
    download_list = [tic for tic in nasdaq_tickers if tic != "SGEN"]
    df_raw = YahooDownloader(
        start_date=TRAIN_START_DATE,
        end_date=TRADE_END_DATE,
        ticker_list=download_list,
    ).fetch_data()

    df_seagen = load_seagen_data()

    # Bring both 'date' columns to datetime64 before stacking. If the
    # downloader hands back date strings while the Seagen frame holds
    # Timestamps, the concatenated column would be a mixed object column on
    # which sorting and the `.dt` accessor both fail.
    df_raw = df_raw.assign(date=pd.to_datetime(df_raw['date']))
    df_seagen = df_seagen.assign(date=pd.to_datetime(df_seagen['date']))

    df_combined = pd.concat([df_raw, df_seagen], ignore_index=True)
    df_combined = df_combined.sort_values(by='date')

    # Convert date to string format for consistency
    df_combined['date'] = df_combined['date'].dt.strftime('%Y-%m-%d')
    return df_combined
def process_data(df_raw):
    """Run feature engineering and expand the result to a full date x ticker grid.

    Args:
        df_raw: Raw price DataFrame with at least 'date' and 'tic' columns, as
            accepted by FeatureEngineer.preprocess_data.

    Returns:
        A DataFrame sorted by ['date', 'tic'] containing one row per ticker
        for every date present in the engineered data. Gaps are forward-filled
        within each ticker; values that precede a ticker's first observation
        stay NaN.
    """
    fe = FeatureEngineer(
        use_technical_indicator=True,
        tech_indicator_list=INDICATORS,
        use_vix=True,
        use_turbulence=True,
        user_defined_feature=False
    )
    processed = fe.preprocess_data(df_raw)

    # Build every (calendar day, ticker) pair over the covered span ...
    list_ticker = processed["tic"].unique().tolist()
    list_date = list(
        pd.date_range(processed['date'].min(), processed['date'].max()).astype(str)
    )
    grid = pd.DataFrame(
        list(itertools.product(list_date, list_ticker)),
        columns=["date", "tic"],
    )
    processed_full = grid.merge(processed, on=["date", "tic"], how="left")

    # ... then keep only the dates that actually occur in the engineered data.
    processed_full = processed_full[processed_full['date'].isin(processed['date'])]
    processed_full = processed_full.sort_values(['date', 'tic'])

    # Forward-fill per ticker. A plain DataFrame.ffill() on a frame sorted by
    # (date, tic) would copy the previous *ticker's* row into a gap, leaking
    # one stock's values into another.
    value_columns = [
        col for col in processed_full.columns if col not in ('date', 'tic')
    ]
    if value_columns:
        processed_full[value_columns] = (
            processed_full.groupby('tic')[value_columns].ffill()
        )
    return processed_full
def process_with_sentiment_chunks(df, sentiment_file, output_file):
    """Attach sentiment scores to `df` and write the result to `output_file`.

    The sentiment CSV is streamed in chunks of CHUNK_SIZE rows so it never has
    to be held in memory in full. Only sentiment rows whose (date, ticker)
    could match a row of `df` are kept; they are merged into `df` once and the
    merged frame is written once.

    Merging and appending the whole of `df` for every chunk would instead
    write one full copy of `df` per chunk, each carrying only that chunk's
    sentiment values.

    Args:
        df: Price/feature DataFrame with 'date' and 'tic' columns. It is not
            modified.
        sentiment_file: Path of a CSV with the columns 'Date', 'Stock_symbol'
            and the column named by SENTIMENT_COLUMN.
        output_file: Path of the CSV to write (overwritten if it exists). It
            holds the columns of `df` plus 'llm_sentiment', which is NaN where
            no sentiment row matched.
    """
    # Work on a copy so the caller's frame keeps its original 'date' dtype.
    prices = df.copy()
    prices['date'] = pd.to_datetime(prices['date'])
    wanted_dates = pd.Index(prices['date'].unique())
    wanted_tics = pd.Index(prices['tic'].unique())

    matching_parts = []
    # The context manager closes the underlying file handle when done.
    with pd.read_csv(
        sentiment_file,
        usecols=['Date', 'Stock_symbol', SENTIMENT_COLUMN],
        chunksize=CHUNK_SIZE,
    ) as sentiment_chunks:
        for chunk_num, sentiment_chunk in enumerate(sentiment_chunks, 1):
            print(f"Processing sentiment chunk {chunk_num}...")

            sentiment_chunk = sentiment_chunk.rename(
                columns={'Stock_symbol': 'tic', SENTIMENT_COLUMN: 'llm_sentiment'}
            )
            # Strip any timezone so the keys compare equal to the naive
            # datetimes in `prices`.
            sentiment_chunk['Date'] = (
                pd.to_datetime(sentiment_chunk['Date']).dt.tz_localize(None)
            )

            # Discard rows that cannot match; this is what keeps memory small.
            relevant = sentiment_chunk[
                sentiment_chunk['tic'].isin(wanted_tics)
                & sentiment_chunk['Date'].isin(wanted_dates)
            ]
            if not relevant.empty:
                matching_parts.append(relevant[['Date', 'tic', 'llm_sentiment']])

    if matching_parts:
        sentiment = pd.concat(matching_parts, ignore_index=True)
        merged = prices.merge(
            sentiment,
            left_on=['date', 'tic'],
            right_on=['Date', 'tic'],
            how='left',
        ).drop(columns=['Date'])
    else:
        # No usable sentiment at all: still emit the data with an empty column
        # so the output file always exists with a stable schema.
        merged = prices.assign(llm_sentiment=float('nan'))

    merged.to_csv(output_file, index=False)
def main():
    """Build the train and trade datasets and save each with sentiment attached.

    Steps: gather prices for the ticker universe, run feature engineering,
    split into the training and trading windows, then merge each split with
    'sentiment_deepseek.csv' and write it to its own CSV file.
    """
    # Nasdaq 100 tickers (July 17, 2023)
    universe = [
        "ADBE", "ADP", "ABNB", "ALGN", "GOOGL", "GOOG", "AMZN", "AMD", "AEP", "AMGN",
        "ADI", "ANSS", "AAPL", "AMAT", "ASML", "AZN", "TEAM", "ADSK", "BKR", "BIIB",
        "BKNG", "AVGO", "CDNS", "CHTR", "CTAS", "CSCO", "CTSH", "CMCSA", "CEG", "CPRT",
        "CSGP", "COST", "CRWD", "CSX", "DDOG", "DXCM", "FANG", "DLTR", "EBAY", "EA",
        "ENPH", "EXC", "FAST", "FTNT", "GEHC", "GILD", "GFS", "HON", "IDXX", "ILMN",
        "INTC", "INTU", "ISRG", "JD", "KDP", "KLAC", "KHC", "LRCX", "LCID", "LULU",
        "MAR", "MRVL", "MELI", "META", "MCHP", "MU", "MSFT", "MRNA", "MDLZ", "MNST",
        "NFLX", "NVDA", "NXPI", "ORLY", "ODFL", "ON", "PCAR", "PANW", "PAYX", "PYPL",
        "PDD", "PEP", "QCOM", "REGN", "ROST", "SGEN", "SIRI", "SBUX", "SNPS", "TMUS",
        "TSLA", "TXN", "TTD", "VRSK", "VRTX", "WBA", "WBD", "WDAY", "XEL", "ZM", "ZS"
    ]

    # Gather prices, then derive the engineered feature set.
    print("Preparing stock data...")
    raw_prices = prepare_stock_data(universe)

    print("Processing data with technical indicators...")
    features = process_data(raw_prices)

    # Carve out the two evaluation windows before any sentiment work starts.
    print("Splitting data...")
    train_frame = data_split(features, TRAIN_START_DATE, TRAIN_END_DATE)
    trade_frame = data_split(features, TRADE_START_DATE, TRADE_END_DATE)

    # One sentiment pass per split: (progress message, data, destination file).
    sentiment_jobs = (
        (
            "Processing training data with sentiment...",
            train_frame,
            'train_data_deepseek_sentiment_2013_2018.csv',
        ),
        (
            "Processing trading data with sentiment...",
            trade_frame,
            'trade_data_deepseek_sentiment_2019_2023.csv',
        ),
    )
    for banner, frame, destination in sentiment_jobs:
        print(banner)
        process_with_sentiment_chunks(frame, 'sentiment_deepseek.csv', destination)

    print("Processing complete!")


# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()