Skip to content

Commit

Permalink
fix for weekly dash getting NaT date values
Browse files Browse the repository at this point in the history
  • Loading branch information
djay committed Dec 4, 2024
1 parent 7e700b8 commit 4255c19
Show file tree
Hide file tree
Showing 3 changed files with 23 additions and 8 deletions.
23 changes: 16 additions & 7 deletions covid_data_dash.py
Original file line number Diff line number Diff line change
Expand Up @@ -273,6 +273,7 @@ def dash_weekly(file="moph_dash_weekly"):
# We miss data not affected by wave
row_update = extract_basics(wb, date, check_date=False)
assert not row_update.empty
assert np.nan not in row_update.index and pd.NaT not in row_update.index
# row = row_update.combine_first(row_since2023)
row = row_update

Expand Down Expand Up @@ -389,6 +390,7 @@ def dash_province_weekly(file="moph_province_weekly"):
def extract_basics(wb, date, check_date=True, base_df=None):

row = pd.DataFrame()
maxdate = None
# D_CaseNew_/7 - daily avg cases
# D_DeathNew_/7 - daily avg deaths
# D_Death (2)
Expand Down Expand Up @@ -438,6 +440,7 @@ def to_cum(cum, periodic, name):
cases = weeks_to_end_date(cases, year_col="Year", week_col="Week", offset=0, date=date)
if cases.empty and base_df is not None and 'Cases' in base_df.columns:
cases = base_df[['Cases']]
maxdate = cases.index.max()

deaths = workbook_series(wb, ["D_DeathTL (2)", "D2_DeathTL (2)"], {
"SUM(death_new)-value": "Deaths",
Expand All @@ -447,6 +450,9 @@ def to_cum(cum, periodic, name):
deaths = weeks_to_end_date(deaths, year_col="Year", week_col="Week", offset=0, date=date)
if deaths.empty and base_df is not None and 'deaths' in base_df.columns:
deaths = base_df[['Deaths']]
maxdate = deaths.index.max()

assert maxdate is not pd.NaT or maxdate is not np.nan

# There is no date in the data to tell us that it's returning the correct data except for
# the deaths and cases. Let's just check whether we got the latest instead.
Expand All @@ -470,13 +476,15 @@ def to_cum(cum, periodic, name):
"Measure Names-alias": "Gender"}, index_col="Gender", index_date=False)

if not ages.empty:
ages['Date'] = deaths.index.max()
ages['Date'] = date
ages = ages.reset_index().pivot(columns=['Age Group'], values=["Deaths"], index=["Date"])
ages.columns = [f"Deaths Age {a}" for a in ["0-4", "10-19", "20-49", "5-9", "50-59", "60-69", "70+"]]
gender['Date'] = deaths.index.max()
row = row.combine_first(ages)
if not gender.empty:
gender['Date'] = date
gender = gender.reset_index().pivot(columns=['Gender'], values=["Deaths"], index=["Date"])
gender.columns = [f"Deaths {g}" for g in ["Male", "Female"]]
row = row.combine_first(gender).combine_first(ages)
row = row.combine_first(gender)

# TODO: should switch from weekly?
row = row.combine_first(vacs).combine_first(vacs_dates)
Expand Down Expand Up @@ -776,25 +784,26 @@ def check_dash_ready():
if __name__ == '__main__':
# check_dash_ready()

dash_daily_df = dash_weekly()
dash_by_province_df = dash_province_weekly()
dash_by_province_daily = dash_by_province()
# dash_ages_df = dash_ages()

# This doesn't add any more info since severe cases was a mistake
# dash_trends_prov_df = dash_trends_prov()
# dash_trends_prov_df = dash_trends_prov()

df = import_csv("combined", index=["Date"], date_cols=["Date"])
prov = import_csv("cases_by_province", index=["Date", "Province"], date_cols=["Date"])

dash_by_province_df = dash_province_weekly()
# df = dash.combine(df, lambda s1, s2: s1)
# df = briefings.combine(df, lambda s1, s2: s1)
vaccols = [f"Vac Given {d} Cum" for d in range(1, 5)]
daily_prov = cum2daily(dash_by_province_df, exclude=vaccols)
dash_by_province_daily = dash_by_province()
prov = dash_by_province_daily.combine_first(daily_prov).combine_first(prov)
# Write this one as it's imported
export(prov, "cases_by_province", csv_only=True)

dash_daily_df = dash_weekly()
dash_daily_df = dash_daily_df.loc[dash_daily_df.index.notnull()] # remove some bad data
daily = cum2daily(dash_daily_df, exclude=vaccols)
daily_deaths = weekly2daily(dash_daily_df[(c for c in dash_daily_df.columns if "Deaths " in c)])
df = daily.combine_first(df).combine_first(daily_deaths)
Expand Down
7 changes: 6 additions & 1 deletion utils_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,8 @@ def todaily(df_cum):
cum = df_cum.reset_index(otherindex)
othervals = cum[otherindex]
cum = cum[[c for c in cols if c not in otherindex]]
# remove any bad time values
cum = cum.loc[cum.index.notnull()]

all_days = pd.date_range(cum.index.min(), cum.index.max(), name="Date")
cum = cum.reindex(all_days) # put in missing days with NaN
Expand All @@ -81,7 +83,9 @@ def todaily(df_cum):
renames = dict((c, c.rstrip(' Cum')) for c in list(daily.columns) if 'Cum' in c)
daily = daily.rename(columns=renames)
assert not (daily < 0).any().any()
daily[otherindex] = othervals.iloc[0] # Should all be the same
# set all rows to the first province's values, as they should all be the same.
for i in otherindex:
daily.insert(0, i, othervals.iloc[0][i])
daily = daily.reset_index().set_index(["Date"] + otherindex)
if not drop:
# add back in the cum values
Expand Down Expand Up @@ -735,5 +739,6 @@ def weeks_to_end_date(df, week_col="Week", year_col="year", offset=0, date=None)
# f"{row[year_col] if year_col else year}-W{int(row[week_col])}-6", "%Y-W%W-%w") - datetime.timedelta(days=offset), axis=1)
df["Date"] = pd.to_datetime(df[year_col].astype(str) + df[week_col].astype(str) +
"-6", format='%Y%U-%w') - DateOffset(days=offset)
assert np.nan not in df['Date'] and pd.NaT not in df['Date']
df = df.drop(columns=set(df.columns).intersection(set([week_col, year_col, None])))
return df.set_index(["Date"] + otherindex)
1 change: 1 addition & 0 deletions utils_scraping_tableau.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ def workbook_series(wb, name, mappings, defaults={"": 0.0}, index_col="Date", en
df.columns = df.columns.map(' '.join)
df = df.reset_index()
df = df.set_index(index_col)
assert np.nan not in df.index
# This seems to be 0 in these graphs, and if we don't fill it then any bad previous values won't get corrected. TODO: make this param-dependent
if type(defaults) != dict:
default = [defaults] * len(df.columns)
Expand Down

9 comments on commit 4255c19

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please sign in to comment.