Skip to content

Commit

Permalink
fix for weekly dash getting NaT date values
Browse files Browse the repository at this point in the history
  • Loading branch information
djay committed Dec 4, 2024
1 parent 7e700b8 commit 4255c19
Show file tree
Hide file tree
Showing 3 changed files with 23 additions and 8 deletions.
23 changes: 16 additions & 7 deletions covid_data_dash.py
Original file line number Diff line number Diff line change
Expand Up @@ -273,6 +273,7 @@ def dash_weekly(file="moph_dash_weekly"):
# We miss data not affected by wave
row_update = extract_basics(wb, date, check_date=False)
assert not row_update.empty
assert np.nan not in row_update.index and pd.NaT not in row_update.index
# row = row_update.combine_first(row_since2023)
row = row_update

Expand Down Expand Up @@ -389,6 +390,7 @@ def dash_province_weekly(file="moph_province_weekly"):
def extract_basics(wb, date, check_date=True, base_df=None):

row = pd.DataFrame()
maxdate = None
# D_CaseNew_/7 - daily avg cases
# D_DeathNew_/7 - daily avg deaths
# D_Death (2)
Expand Down Expand Up @@ -438,6 +440,7 @@ def to_cum(cum, periodic, name):
cases = weeks_to_end_date(cases, year_col="Year", week_col="Week", offset=0, date=date)
if cases.empty and base_df is not None and 'Cases' in base_df.columns:
cases = base_df[['Cases']]
maxdate = cases.index.max()

deaths = workbook_series(wb, ["D_DeathTL (2)", "D2_DeathTL (2)"], {
"SUM(death_new)-value": "Deaths",
Expand All @@ -447,6 +450,9 @@ def to_cum(cum, periodic, name):
deaths = weeks_to_end_date(deaths, year_col="Year", week_col="Week", offset=0, date=date)
if deaths.empty and base_df is not None and 'deaths' in base_df.columns:
deaths = base_df[['Deaths']]
maxdate = deaths.index.max()

assert maxdate is not pd.NaT or maxdate is not np.nan

# There is no date in the data to tell us that it's returning the correct data except for
# the deaths and cases. Let's just check whether we got the latest instead.
Expand All @@ -470,13 +476,15 @@ def to_cum(cum, periodic, name):
"Measure Names-alias": "Gender"}, index_col="Gender", index_date=False)

if not ages.empty:
ages['Date'] = deaths.index.max()
ages['Date'] = date
ages = ages.reset_index().pivot(columns=['Age Group'], values=["Deaths"], index=["Date"])
ages.columns = [f"Deaths Age {a}" for a in ["0-4", "10-19", "20-49", "5-9", "50-59", "60-69", "70+"]]
gender['Date'] = deaths.index.max()
row = row.combine_first(ages)
if not gender.empty:
gender['Date'] = date
gender = gender.reset_index().pivot(columns=['Gender'], values=["Deaths"], index=["Date"])
gender.columns = [f"Deaths {g}" for g in ["Male", "Female"]]
row = row.combine_first(gender).combine_first(ages)
row = row.combine_first(gender)

# TODO: should switch from weekly?
row = row.combine_first(vacs).combine_first(vacs_dates)
Expand Down Expand Up @@ -776,25 +784,26 @@ def check_dash_ready():
if __name__ == '__main__':
# check_dash_ready()

dash_daily_df = dash_weekly()
dash_by_province_df = dash_province_weekly()
dash_by_province_daily = dash_by_province()
# dash_ages_df = dash_ages()

# This doesn't add any more info since severe cases was a mistake
# dash_trends_prov_df = dash_trends_prov()
# dash_trends_prov_df = dash_trends_prov()

df = import_csv("combined", index=["Date"], date_cols=["Date"])
prov = import_csv("cases_by_province", index=["Date", "Province"], date_cols=["Date"])

dash_by_province_df = dash_province_weekly()
# df = dash.combine(df, lambda s1, s2: s1)
# df = briefings.combine(df, lambda s1, s2: s1)
vaccols = [f"Vac Given {d} Cum" for d in range(1, 5)]
daily_prov = cum2daily(dash_by_province_df, exclude=vaccols)
dash_by_province_daily = dash_by_province()
prov = dash_by_province_daily.combine_first(daily_prov).combine_first(prov)
# Write this one as it's imported
export(prov, "cases_by_province", csv_only=True)

dash_daily_df = dash_weekly()
dash_daily_df = dash_daily_df.loc[dash_daily_df.index.notnull()] # remove some bad data
daily = cum2daily(dash_daily_df, exclude=vaccols)
daily_deaths = weekly2daily(dash_daily_df[(c for c in dash_daily_df.columns if "Deaths " in c)])
df = daily.combine_first(df).combine_first(daily_deaths)
Expand Down
7 changes: 6 additions & 1 deletion utils_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,8 @@ def todaily(df_cum):
cum = df_cum.reset_index(otherindex)
othervals = cum[otherindex]
cum = cum[[c for c in cols if c not in otherindex]]
# remove any bad time values
cum = cum.loc[cum.index.notnull()]

all_days = pd.date_range(cum.index.min(), cum.index.max(), name="Date")
cum = cum.reindex(all_days) # put in missing days with NaN
Expand All @@ -81,7 +83,9 @@ def todaily(df_cum):
renames = dict((c, c.rstrip(' Cum')) for c in list(daily.columns) if 'Cum' in c)
daily = daily.rename(columns=renames)
assert not (daily < 0).any().any()
daily[otherindex] = othervals.iloc[0] # Should all be the same
# set all rows to the first province's values, as they should all be the same.
for i in otherindex:
daily.insert(0, i, othervals.iloc[0][i])
daily = daily.reset_index().set_index(["Date"] + otherindex)
if not drop:
# add back in the cum values
Expand Down Expand Up @@ -735,5 +739,6 @@ def weeks_to_end_date(df, week_col="Week", year_col="year", offset=0, date=None)
# f"{row[year_col] if year_col else year}-W{int(row[week_col])}-6", "%Y-W%W-%w") - datetime.timedelta(days=offset), axis=1)
df["Date"] = pd.to_datetime(df[year_col].astype(str) + df[week_col].astype(str) +
"-6", format='%Y%U-%w') - DateOffset(days=offset)
assert np.nan not in df['Date'] and pd.NaT not in df['Date']
df = df.drop(columns=set(df.columns).intersection(set([week_col, year_col, None])))
return df.set_index(["Date"] + otherindex)
1 change: 1 addition & 0 deletions utils_scraping_tableau.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ def workbook_series(wb, name, mappings, defaults={"": 0.0}, index_col="Date", en
df.columns = df.columns.map(' '.join)
df = df.reset_index()
df = df.set_index(index_col)
assert np.nan not in df.index
# This seems to be 0 in these graphs, and if we don't fill it then any bad previous values won't get corrected. TODO: make this param-dependent
if type(defaults) != dict:
default = [defaults] * len(df.columns)
Expand Down

9 comments on commit 4255c19

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please sign in to comment.