#!/usr/bin/python3 -OO ''' Merge the daily-updated data from the covid19 page to that from the mortality page, which gets updated less often. One problem is to convert the month/day/year format to the "week of flu season" number. Week 16 is the week ending April 18, 2020. ''' import sys, os, csv, re, logging from datetime import datetime, timedelta from importdata2 import HEADERS as CSVHEADERS from importtsv import HEADERS as TSVHEADERS logging.basicConfig(level=logging.DEBUG if __debug__ else logging.INFO) WEEK = timedelta(days=7) WEEK16 = datetime(year=2020, month=4, day=18) WEEKHEADER = 'Week ending date in which the death occurred' TSVDEATHS = 'Deaths from All Causes' TSVPERCENT = 'Percent of Expected Deaths2' CSVDEATHS = 'TOTAL DEATHS' CSVPERCENT = 'PERCENT COMPLETE' def tsvread(tsvfile): ''' Read in a TSV file. ''' with open(tsvfile, 'r') as infile: tsvin = csv.reader(infile, delimiter='\t') rows = [row for row in tsvin] headers = rows.pop(0) tsvdata = [dict(zip(headers, row)) for row in rows] return tsvdata def csvread(csvfile): ''' Read in a CSV file. ''' with open(csvfile, 'r') as infile: csvin = csv.reader(infile) rows = [row for row in csvin] headers = rows.pop(0) csvdata = [dict(zip(headers, row)) for row in rows] return csvdata def double(number): ''' Get rid of spurious characters and turn numeric string to a float ''' return float(re.compile('[0-9,.]+').search(number).group().replace(',', '')) def greater(csvrow, tsvrow): ''' Return csvrow with greater deaths number from two rows. Side effect: csvrow is already mutated with updated values. Note: The percentage in the `fluview` CSV file is different from that in the TSV data from NCHS: the former is the "percent complete" and the latter "expected deaths". So while it *may* be valid to multiply by the inverse of the percentage in the previous case, it is dubious in the latter. So we will mark it as being 100% complete to avoid misleading numbers. ''' csvdeaths = double(csvrow[CSVDEATHS]) csvpercent = double(csvrow[CSVPERCENT]) tsvdeaths = double(tsvrow[TSVDEATHS]) tsvpercent = double(tsvrow[TSVPERCENT]) csvvalue = (csvdeaths if csvpercent >= 100 else csvdeaths * (100 / csvpercent)) if tsvdeaths > csvvalue: logging.debug('replacing %s with %s', csvvalue, tsvdeaths) csvrow[CSVDEATHS] = tsvrow[TSVDEATHS] csvrow[CSVPERCENT] = '100%' return csvrow # just for debugging; the dict has already been mutated def merge(csvfile, tsvfile, mergedfile): ''' Put highest number from TSV and CSV data into merged CSV file. ''' csvdata = csvread(csvfile) tsvdata = tsvread(tsvfile) mapping = {'16': WEEK16} for week in range(1, 16): mapping[str(week)] = WEEK16 - (WEEK * (16 - week)) for week in range(17, 40): mapping[str(week)] = WEEK16 + (WEEK * (week - 16)) logging.debug('mapping: %s', mapping) tsvdict = {datetime.strptime(row[WEEKHEADER], '%m/%d/%Y'): row for row in tsvdata if row[WEEKHEADER] != 'Total Deaths'} csvdict = {mapping[row['WEEK']]: row for row in csvdata if row['AREA'] == 'National' and row['SEASON'] == '2019-20' and int(row['WEEK']) < 40} for week, row in csvdict.items(): if week in tsvdict: row.update(greater(row, tsvdict[week])) with open(mergedfile, 'w') as outfile: csvout = csv.writer(outfile) csvout.writerows([CSVHEADERS] + [[row[header] for header in CSVHEADERS] for row in csvdata]) if __name__ == '__main__': merge(*sys.argv[1:])