Skip to content

Commit 677f3d5

Browse files
authored
Merge pull request ranaroussi#1660 from ranaroussi/fix/price-repair-100x-and-calibration
Fix price repair: 100x & calibration
2 parents e1f94ed + 4f9b05a commit 677f3d5

File tree

2 files changed

+126
-98
lines changed

2 files changed

+126
-98
lines changed
Lines changed: 23 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,24 +1,24 @@
11
Date,Open,High,Low,Close,Adj Close,Volume,Dividends,Stock Splits
2-
2022-06-06 00:00:00+01:00,14.5500004291534,14.5500004291534,14.5500004291534,14.5500004291534,14.5500004291534,0,0,0
3-
2022-06-01 00:00:00+01:00,14.5500004291534,14.5500004291534,14.5500004291534,14.5500004291534,14.5500004291534,0,0,0
4-
2022-05-31 00:00:00+01:00,14.5500004291534,14.5500004291534,14.5500004291534,14.5500004291534,14.5500004291534,0,0,0
5-
2022-05-30 00:00:00+01:00,14.5500004291534,14.5500004291534,14.5500004291534,14.5500004291534,14.5500004291534,0,0,0
6-
2022-05-27 00:00:00+01:00,14.5500004291534,14.5500004291534,14.5500004291534,14.5500004291534,14.5500004291534,0,0,0
7-
2022-05-26 00:00:00+01:00,14.5500004291534,14.5500004291534,14.5500004291534,14.5500004291534,14.5500004291534,0,0,0
8-
2022-05-25 00:00:00+01:00,14.5500004291534,14.5500004291534,14.5500004291534,14.5500004291534,14.5500004291534,0,0,0
9-
2022-05-24 00:00:00+01:00,14.5500004291534,14.5500004291534,14.5500004291534,14.5500004291534,14.5500004291534,0,0,0
10-
2022-05-23 00:00:00+01:00,14.5500004291534,14.5500004291534,14.5500004291534,14.5500004291534,14.5500004291534,0,0,0
11-
2022-05-20 00:00:00+01:00,14.5500004291534,14.5500004291534,14.5500004291534,14.5500004291534,14.5500004291534,0,0,0
12-
2022-05-19 00:00:00+01:00,14.55,14.55,14.55,14.55,14.55,0,0,0
13-
2022-05-18 00:00:00+01:00,14.55,14.55,14.55,14.55,14.55,532454,0,0
14-
2022-05-17 00:00:00+01:00,14.55,14.55,14.55,14.55,14.55,0,0,0
15-
2022-05-16 00:00:00+01:00,14.55,14.55,14.55,14.55,14.55,0,0,0
16-
2022-05-13 00:00:00+01:00,14.55,14.55,14.55,14.55,14.55,0,0,0
17-
2022-05-12 00:00:00+01:00,14.55,14.55,14.55,14.55,14.55,0,0,0
18-
2022-05-11 00:00:00+01:00,14.55,14.55,14.55,14.55,14.55,0,0,0
19-
2022-05-10 00:00:00+01:00,14.55,14.55,14.55,14.55,14.55,0,0,0
20-
2022-05-09 00:00:00+01:00,14.55,14.55,14.55,14.55,14.55,0,0,0
21-
2022-05-06 00:00:00+01:00,14.5500004291534,14.5500004291534,14.5500004291534,14.5500004291534,14.5500004291534,0,0,0
22-
2022-05-05 00:00:00+01:00,14.5500004291534,14.5500004291534,14.5500004291534,14.5500004291534,14.5500004291534,0,0,0
23-
2022-05-04 00:00:00+01:00,14.5500004291534,14.5500004291534,14.5500004291534,14.5500004291534,14.5500004291534,0,0,0
24-
2022-05-03 00:00:00+01:00,14.5500004291534,14.5500004291534,14.5500004291534,14.5500004291534,14.5500004291534,0,0,0
2+
2022-06-06 00:00:00+01:00,0.145500004291534,0.145500004291534,0.145500004291534,0.145500004291534,0.145500004291534,0,0,0
3+
2022-06-01 00:00:00+01:00,0.145500004291534,0.145500004291534,0.145500004291534,0.145500004291534,0.145500004291534,0,0,0
4+
2022-05-31 00:00:00+01:00,0.145500004291534,0.145500004291534,0.145500004291534,0.145500004291534,0.145500004291534,0,0,0
5+
2022-05-30 00:00:00+01:00,0.145500004291534,0.145500004291534,0.145500004291534,0.145500004291534,0.145500004291534,0,0,0
6+
2022-05-27 00:00:00+01:00,0.145500004291534,0.145500004291534,0.145500004291534,0.145500004291534,0.145500004291534,0,0,0
7+
2022-05-26 00:00:00+01:00,0.145500004291534,0.145500004291534,0.145500004291534,0.145500004291534,0.145500004291534,0,0,0
8+
2022-05-25 00:00:00+01:00,0.145500004291534,0.145500004291534,0.145500004291534,0.145500004291534,0.145500004291534,0,0,0
9+
2022-05-24 00:00:00+01:00,0.145500004291534,0.145500004291534,0.145500004291534,0.145500004291534,0.145500004291534,0,0,0
10+
2022-05-23 00:00:00+01:00,0.145500004291534,0.145500004291534,0.145500004291534,0.145500004291534,0.145500004291534,0,0,0
11+
2022-05-20 00:00:00+01:00,0.145500004291534,0.145500004291534,0.145500004291534,0.145500004291534,0.145500004291534,0,0,0
12+
2022-05-19 00:00:00+01:00,0.1455,0.1455,0.1455,0.1455,0.1455,0,0,0
13+
2022-05-18 00:00:00+01:00,0.1455,0.1455,0.1455,0.1455,0.1455,532454,0,0
14+
2022-05-17 00:00:00+01:00,0.1455,0.1455,0.1455,0.1455,0.1455,0,0,0
15+
2022-05-16 00:00:00+01:00,0.1455,0.1455,0.1455,0.1455,0.1455,0,0,0
16+
2022-05-13 00:00:00+01:00,0.1455,0.1455,0.1455,0.1455,0.1455,0,0,0
17+
2022-05-12 00:00:00+01:00,0.1455,0.1455,0.1455,0.1455,0.1455,0,0,0
18+
2022-05-11 00:00:00+01:00,0.1455,0.1455,0.1455,0.1455,0.1455,0,0,0
19+
2022-05-10 00:00:00+01:00,0.1455,0.1455,0.1455,0.1455,0.1455,0,0,0
20+
2022-05-09 00:00:00+01:00,0.1455,0.1455,0.1455,0.1455,0.1455,0,0,0
21+
2022-05-06 00:00:00+01:00,0.145500004291534,0.145500004291534,0.145500004291534,0.145500004291534,0.145500004291534,0,0,0
22+
2022-05-05 00:00:00+01:00,0.145500004291534,0.145500004291534,0.145500004291534,0.145500004291534,0.145500004291534,0,0,0
23+
2022-05-04 00:00:00+01:00,0.145500004291534,0.145500004291534,0.145500004291534,0.145500004291534,0.145500004291534,0,0,0
24+
2022-05-03 00:00:00+01:00,0.145500004291534,0.145500004291534,0.145500004291534,0.145500004291534,0.145500004291534,0,0,0

yfinance/base.py

Lines changed: 103 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -496,8 +496,8 @@ def _reconstruct_intervals_batch(self, df, interval, prepost, tag=-1):
496496
# Limit max reconstruction depth to 2:
497497
if self._reconstruct_start_interval is None:
498498
self._reconstruct_start_interval = interval
499-
if interval != self._reconstruct_start_interval:
500-
logger.debug(f"{self.ticker}: Price repair has hit max depth of 1 ('%s'->'%s')", self._reconstruct_start_interval, interval)
499+
if interval != self._reconstruct_start_interval and interval != nexts[self._reconstruct_start_interval]:
500+
logger.debug(f"{self.ticker}: Price repair has hit max depth of 2 ('%s'->'%s'->'%s')", self._reconstruct_start_interval, nexts[self._reconstruct_start_interval], interval)
501501
return df
502502

503503
df = df.sort_index()
@@ -760,7 +760,12 @@ def _reconstruct_intervals_batch(self, df, interval, prepost, tag=-1):
760760
weights = weights[:, None] # transpose
761761
weights = np.tile(weights, len(calib_cols)) # 1D -> 2D
762762
weights = weights[calib_filter] # flatten
763-
ratio = np.average(ratios, weights=weights)
763+
not1 = ~np.isclose(ratios, 1.0, rtol=0.00001)
764+
if np.sum(not1) == len(calib_cols):
765+
# Only 1 calibration row in df_new is different to df_block so ignore
766+
ratio = 1.0
767+
else:
768+
ratio = np.average(ratios, weights=weights)
764769
logger.debug(f"Price calibration ratio (raw) = {ratio:6f}")
765770
ratio_rcp = round(1.0 / ratio, 1)
766771
ratio = round(ratio, 1)
@@ -1235,19 +1240,26 @@ def _fix_prices_sudden_change(self, df, interval, tz_exchange, change, correct_v
12351240
# 100x errors into suspended intervals. Clue is no price change and 0 volume.
12361241
# Better to use last active trading interval as baseline.
12371242
f_no_activity = (df2['Low'] == df2['High']) & (df2['Volume']==0)
1243+
f_no_activity = f_no_activity | df2[OHLC].isna().all(axis=1)
12381244
appears_suspended = f_no_activity.any() and np.where(f_no_activity)[0][0]==0
12391245
f_active = ~f_no_activity
12401246
idx_latest_active = np.where(f_active & np.roll(f_active, 1))[0]
12411247
if len(idx_latest_active) == 0:
12421248
idx_latest_active = None
12431249
else:
1244-
idx_latest_active = idx_latest_active[0]
1245-
logger.debug(f'price-repair-split: appears_suspended={appears_suspended}, idx_latest_active={idx_latest_active}')
1250+
idx_latest_active = int(idx_latest_active[0])
1251+
log_msg = f'price-repair-split: appears_suspended={appears_suspended}, idx_latest_active={idx_latest_active}'
1252+
if idx_latest_active is not None:
1253+
log_msg += f' ({df.index[idx_latest_active].date()})'
1254+
logger.debug(log_msg)
12461255

12471256
if logger.isEnabledFor(logging.DEBUG):
12481257
df_debug = df2.copy()
1249-
df_debug = df_debug.drop(['Adj Close', 'Low', 'High', 'Volume', 'Dividends', 'Repaired?'], axis=1, errors='ignore')
1258+
df_debug = df_debug.drop(['Adj Close', 'Volume', 'Dividends', 'Repaired?'], axis=1, errors='ignore')
12501259
debug_cols = ['Low', 'High']
1260+
df_debug = df_debug.drop([c for c in OHLC if c not in debug_cols], axis=1, errors='ignore')
1261+
else:
1262+
debug_cols = []
12511263

12521264
# Calculate daily price % change. To reduce effect of price volatility,
12531265
# calculate change for each OHLC column.
@@ -1359,87 +1371,44 @@ def _fix_prices_sudden_change(self, df, interval, tz_exchange, change, correct_v
13591371
return df
13601372

13611373
# if logger.isEnabledFor(logging.DEBUG):
1362-
# logger.debug(f"price-repair-split: my workings:")
1363-
# logger.debug('\n' + str(df_debug))
1374+
# df_debug['i'] = list(range(0, df_debug.shape[0]))
1375+
# df_debug['i_rev'] = df_debug.shape[0]-1 - df_debug['i']
1376+
# with pd.option_context('display.max_rows', None, 'display.max_columns', 10, 'display.width', 1000): # more options can be specified also
1377+
# logger.debug(f"price-repair-split: my workings:" + '\n' + str(df_debug))
13641378

13651379
def map_signals_to_ranges(f, f_up, f_down):
1380+
# Ensure 0th element is False, because True is nonsense
1381+
if f[0]:
1382+
f = np.copy(f) ; f[0] = False
1383+
f_up = np.copy(f_up) ; f_up[0] = False
1384+
f_down = np.copy(f_down) ; f_down[0] = False
1385+
13661386
if not f.any():
13671387
return []
13681388

13691389
true_indices = np.where(f)[0]
13701390
ranges = []
13711391

1372-
idx_first_f = np.where(f)[0][0]
1373-
logger.debug(f'idx_latest_active={idx_latest_active} idx_first_f={idx_first_f}')
1374-
if appears_suspended and (idx_latest_active is None or idx_latest_active >= idx_first_f):
1375-
# baseline = 2nd index, because no active trading since latest split error
1376-
1377-
# First, process prices older than idx_latest_active:
1378-
if idx_latest_active is None:
1379-
true_indices_old = []
1380-
else:
1381-
true_indices_old = [i for i in true_indices if i > idx_latest_active]
1382-
if len(true_indices_old) > 0:
1383-
for i in range(len(true_indices_old) - 1):
1384-
if i % 2 == 0:
1385-
if split > 1.0:
1386-
adj = 'split' if f_down[true_indices_old[i]] else '1.0/split'
1387-
else:
1388-
adj = '1.0/split' if f_down[true_indices_old[i]] else 'split'
1389-
ranges.append((true_indices_old[i], true_indices_old[i+1], adj))
1390-
1391-
if len(true_indices_old) % 2 != 0:
1392-
if split > 1.0:
1393-
adj = 'split' if f_down[true_indices_old[-1]] else '1.0/split'
1394-
else:
1395-
adj = '1.0/split' if f_down[true_indices_old[-1]] else 'split'
1396-
ranges.append((true_indices_old[-1], len(f), adj))
1397-
1398-
# Next, process prices more recent than idx_latest_active:
1399-
true_indices_recent = [i for i in true_indices if i not in true_indices_old]
1400-
if len(true_indices_recent) > 0:
1392+
for i in range(len(true_indices) - 1):
1393+
if i % 2 == 0:
14011394
if split > 1.0:
1402-
adj = 'split' if f_up[true_indices_recent[0]] else '1.0/split'
1395+
adj = 'split' if f_down[true_indices[i]] else '1.0/split'
14031396
else:
1404-
adj = '1.0/split' if f_up[true_indices_recent[0]] else 'split'
1405-
ranges.append((0, true_indices_recent[0], adj))
1406-
1407-
for i in range(1, len(true_indices_recent) - 1):
1408-
if i % 2 == 1:
1409-
if split > 1.0:
1410-
adj = '1.0/split' if f_up[true_indices_recent[i]] else 'split'
1411-
else:
1412-
adj = 'split' if f_up[true_indices_recent[i]] else '1.0/split'
1413-
ranges.append((true_indices_recent[i], true_indices_recent[i + 1], adj))
1414-
1415-
if len(true_indices_recent) % 2 == 0:
1416-
if split > 1.0:
1417-
adj = 'split' if f_down[true_indices_recent[-1]] else '1.0/split'
1418-
else:
1419-
adj = '1.0/split' if f_down[true_indices_recent[-1]] else 'split'
1420-
ranges.append((true_indices_recent[-1], len(f), adj))
1421-
1422-
ranges = sorted(ranges, key=lambda x: x[0])
1423-
1424-
else:
1425-
# baseline = 2nd index
1426-
for i in range(len(true_indices) - 1):
1427-
if i % 2 == 0:
1428-
if split > 1.0:
1429-
adj = 'split' if f_down[true_indices[i]] else '1.0/split'
1430-
else:
1431-
adj = '1.0/split' if f_down[true_indices[i]] else 'split'
1432-
ranges.append((true_indices[i], true_indices[i + 1], adj))
1397+
adj = '1.0/split' if f_down[true_indices[i]] else 'split'
1398+
ranges.append((true_indices[i], true_indices[i + 1], adj))
14331399

1434-
if len(true_indices) % 2 != 0:
1435-
if split > 1.0:
1436-
adj = 'split' if f_down[true_indices[-1]] else '1.0/split'
1437-
else:
1438-
adj = '1.0/split' if f_down[true_indices[-1]] else 'split'
1439-
ranges.append((true_indices[-1], len(f), adj))
1400+
if len(true_indices) % 2 != 0:
1401+
if split > 1.0:
1402+
adj = 'split' if f_down[true_indices[-1]] else '1.0/split'
1403+
else:
1404+
adj = '1.0/split' if f_down[true_indices[-1]] else 'split'
1405+
ranges.append((true_indices[-1], len(f), adj))
14401406

14411407
return ranges
14421408

1409+
if idx_latest_active is not None:
1410+
idx_rev_latest_active = df.shape[0] - 1 - idx_latest_active
1411+
logger.debug(f'price-repair-split: idx_latest_active={idx_latest_active}, idx_rev_latest_active={idx_rev_latest_active}')
14431412
if correct_columns_individually:
14441413
f_corrected = np.full(n, False)
14451414
if correct_volume:
@@ -1455,7 +1424,38 @@ def map_signals_to_ranges(f, f_up, f_down):
14551424
OHLC_correct_ranges = [None, None, None, None]
14561425
for j in range(len(OHLC)):
14571426
c = OHLC[j]
1458-
ranges = map_signals_to_ranges(f[:, j], f_up[:, j], f_down[:, j])
1427+
idx_first_f = np.where(f)[0][0]
1428+
if appears_suspended and (idx_latest_active is not None and idx_latest_active >= idx_first_f):
1429+
# Suspended midway during data date range.
1430+
# 1: process data before suspension in index-ascending (date-descending) order.
1431+
# 2: process data after suspension in index-descending order. Requires signals to be reversed,
1432+
# then returned ranges to also be reversed, because this logic was originally written for
1433+
# index-ascending (date-descending) order.
1434+
fj = f[:, j]
1435+
f_upj = f_up[:, j]
1436+
f_downj = f_down[:, j]
1437+
ranges_before = map_signals_to_ranges(fj[idx_latest_active:], f_upj[idx_latest_active:], f_downj[idx_latest_active:])
1438+
if len(ranges_before) > 0:
1439+
# Shift each range back to global indexing
1440+
for i in range(len(ranges_before)):
1441+
r = ranges_before[i]
1442+
ranges_before[i] = (r[0] + idx_latest_active, r[1] + idx_latest_active, r[2])
1443+
f_rev_downj = np.flip(np.roll(f_upj, -1)) # correct
1444+
f_rev_upj = np.flip(np.roll(f_downj, -1)) # correct
1445+
f_revj = f_rev_upj | f_rev_downj
1446+
ranges_after = map_signals_to_ranges(f_revj[idx_rev_latest_active:], f_rev_upj[idx_rev_latest_active:], f_rev_downj[idx_rev_latest_active:])
1447+
if len(ranges_after) > 0:
1448+
# Shift each range back to global indexing:
1449+
for i in range(len(ranges_after)):
1450+
r = ranges_after[i]
1451+
ranges_after[i] = (r[0] + idx_rev_latest_active, r[1] + idx_rev_latest_active, r[2])
1452+
# Flip range to normal ordering
1453+
for i in range(len(ranges_after)):
1454+
r = ranges_after[i]
1455+
ranges_after[i] = (n-r[1], n-r[0], r[2])
1456+
ranges = ranges_before ; ranges.extend(ranges_after)
1457+
else:
1458+
ranges = map_signals_to_ranges(f[:, j], f_up[:, j], f_down[:, j])
14591459
logger.debug(f"column '{c}' ranges: {ranges}")
14601460
if start_min is not None:
14611461
# Prune ranges that are older than start_min
@@ -1514,7 +1514,35 @@ def map_signals_to_ranges(f, f_up, f_down):
15141514
df2.loc[f_corrected, 'Repaired?'] = True
15151515

15161516
else:
1517-
ranges = map_signals_to_ranges(f, f_up, f_down)
1517+
idx_first_f = np.where(f)[0][0]
1518+
if appears_suspended and (idx_latest_active is not None and idx_latest_active >= idx_first_f):
1519+
# Suspended midway during data date range.
1520+
# 1: process data before suspension in index-ascending (date-descending) order.
1521+
# 2: process data after suspension in index-descending order. Requires signals to be reversed,
1522+
# then returned ranges to also be reversed, because this logic was originally written for
1523+
# index-ascending (date-descending) order.
1524+
ranges_before = map_signals_to_ranges(f[idx_latest_active:], f_up[idx_latest_active:], f_down[idx_latest_active:])
1525+
if len(ranges_before) > 0:
1526+
# Shift each range back to global indexing
1527+
for i in range(len(ranges_before)):
1528+
r = ranges_before[i]
1529+
ranges_before[i] = (r[0] + idx_latest_active, r[1] + idx_latest_active, r[2])
1530+
f_rev_down = np.flip(np.roll(f_up, -1))
1531+
f_rev_up = np.flip(np.roll(f_down, -1))
1532+
f_rev = f_rev_up | f_rev_down
1533+
ranges_after = map_signals_to_ranges(f_rev[idx_rev_latest_active:], f_rev_up[idx_rev_latest_active:], f_rev_down[idx_rev_latest_active:])
1534+
if len(ranges_after) > 0:
1535+
# Shift each range back to global indexing:
1536+
for i in range(len(ranges_after)):
1537+
r = ranges_after[i]
1538+
ranges_after[i] = (r[0] + idx_rev_latest_active, r[1] + idx_rev_latest_active, r[2])
1539+
# Flip range to normal ordering
1540+
for i in range(len(ranges_after)):
1541+
r = ranges_after[i]
1542+
ranges_after[i] = (n-r[1], n-r[0], r[2])
1543+
ranges = ranges_before ; ranges.extend(ranges_after)
1544+
else:
1545+
ranges = map_signals_to_ranges(f, f_up, f_down)
15181546
if start_min is not None:
15191547
# Prune ranges that are older than start_min
15201548
for i in range(len(ranges)-1, -1, -1):

0 commit comments

Comments
 (0)