feat: add PIT OHLCV runner and fetch support

2026-04-18 14:59:48 +08:00
parent c015873ee1
commit f5e8c708f3
5 changed files with 221 additions and 16 deletions
--- a/research/fetch_historical.py
+++ b/research/fetch_historical.py
@@ -25,6 +25,15 @@ YEARS = 10
 BATCH_SIZE = 50


+def _field_out_paths() -> dict[str, str]:
+    """Map each saved price field name to the CSV path of its panel in DATA_DIR."""
+    # Insertion order (Close, High, Low, Volume) is what callers iterate over.
+    filenames = (
+        ("Close", "us_pit_close.csv"),
+        ("High", "us_pit_high.csv"),
+        ("Low", "us_pit_low.csv"),
+        ("Volume", "us_pit_volume.csv"),
+    )
+    return {field: os.path.join(DATA_DIR, name) for field, name in filenames}
+
+
 def fetch_all_historical(force: bool = False) -> pd.DataFrame:
    os.makedirs(DATA_DIR, exist_ok=True)
    intervals = uh.load_sp500_history()
@@ -74,8 +83,41 @@ def fetch_all_historical(force: bool = False) -> pd.DataFrame:
    return combined


+def fetch_all_historical_ohlcv(force: bool = False) -> dict[str, pd.DataFrame]:
+    """Download Close/High/Low/Volume panels and write each one to CSV.
+
+    Tickers are everything ``uh.all_tickers_ever`` returns for the loaded
+    history plus "SPY", de-duplicated and sorted. The request starts
+    ``365 * YEARS`` days before now. The Close panel is written to
+    ``OUT_PATH`` and, like the other fields, to its ``_field_out_paths`` entry.
+
+    Args:
+        force: Mirrors ``fetch_all_historical``'s signature; this function
+            does not read it and always downloads.
+
+    Returns:
+        The panels keyed by lower-cased field name
+        ("close", "high", "low", "volume").
+
+    Raises:
+        RuntimeError: If any requested field came back missing or empty.
+    """
+    os.makedirs(DATA_DIR, exist_ok=True)
+    intervals = uh.load_sp500_history()
+    tickers = uh.all_tickers_ever(intervals) + ["SPY"]
+    tickers = sorted(set(tickers))
+    start = (datetime.now() - timedelta(days=365 * YEARS)).strftime("%Y-%m-%d")
+    out_paths = _field_out_paths()
+    panels = _download_batched_fields(tickers, start=start, fields=list(out_paths))
+    # The batched downloader returns a key for every requested field and uses
+    # an empty DataFrame when nothing was fetched, so the dict itself is always
+    # truthy. Inspect the frames so a failed download raises instead of
+    # overwriting the CSVs with empty files.
+    if not panels or any(
+        panels.get(field) is None or panels[field].empty for field in out_paths
+    ):
+        raise RuntimeError("No PIT OHLCV data downloaded")
+
+    close = panels["Close"]
+    close.to_csv(OUT_PATH)
+    print(f"--- Saved {close.shape} to {OUT_PATH} ---")
+    result: dict[str, pd.DataFrame] = {"close": close}
+    for field, path in out_paths.items():
+        panel = panels[field]
+        panel.to_csv(path)
+        print(f"--- Saved {panel.shape} to {path} ---")
+        result[field.lower()] = panel
+    return result
+
+
 def _download_batched(tickers: list[str], start: str) -> pd.DataFrame | None:
-    frames = []
+    """Return the Close panel for ``tickers`` from ``start``, or None if no data.
+
+    Thin wrapper over ``_download_batched_fields`` that requests only "Close".
+    """
+    panels = _download_batched_fields(tickers, start=start, fields=["Close"])
+    # The field-based downloader always returns a "Close" key, holding an empty
+    # DataFrame when every batch failed or came back empty. Map that case back
+    # to None so callers relying on the declared Optional return still work.
+    close = panels.get("Close") if panels else None
+    if close is None or close.empty:
+        return None
+    return close
+
+
+def _download_batched_fields(
+    tickers: list[str],
+    start: str,
+    fields: list[str],
+) -> dict[str, pd.DataFrame]:
+    frames = {field: [] for field in fields}
    n = len(tickers)
    for i in range(0, n, BATCH_SIZE):
        batch = tickers[i:i + BATCH_SIZE]
@@ -85,19 +127,24 @@ def _download_batched(tickers: list[str], start: str) -> pd.DataFrame | None:
                              progress=False, threads=True)
            if raw.empty:
                continue
-            if isinstance(raw.columns, pd.MultiIndex):
-                close = raw["Close"]
-            else:
-                close = raw[["Close"]].rename(columns={"Close": batch[0]})
-            close = close.dropna(axis=1, how="all")
-            if not close.empty:
-                frames.append(close)
+            for field in fields:
+                if isinstance(raw.columns, pd.MultiIndex):
+                    panel = raw[field]
+                else:
+                    panel = raw[[field]].rename(columns={field: batch[0]})
+                panel = panel.dropna(axis=1, how="all")
+                if not panel.empty:
+                    frames[field].append(panel)
        except Exception as e:
            print(f"    batch failed: {e}")
-    if not frames:
-        return None
-    result = pd.concat(frames, axis=1).sort_index()
-    result = result.loc[:, ~result.columns.duplicated()]
+    result = {}
+    for field, field_frames in frames.items():
+        if field_frames:
+            panel = pd.concat(field_frames, axis=1).sort_index()
+            panel = panel.loc[:, ~panel.columns.duplicated()]
+            result[field] = panel
+        else:
+            result[field] = pd.DataFrame()
    return result