-
Notifications
You must be signed in to change notification settings - Fork 0
/
process_self_reported_conditions.py
executable file
·53 lines (40 loc) · 1.94 KB
/
process_self_reported_conditions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
#!/usr/bin/env python
import argparse

import pandas

description = '''
Gather self-reported conditions for an individual
The result is a table of entries, one for each reported non-cancer condition (field 20002)
ID condition_code year
So a participant may have multiple entries, or none if they reported no conditions
'''

SELF_REPORTED_CONDITION_FIELD = 20002
YEAR_FIRST_DIAGNOSED_FIELD = 20008
# Values of the year field that are blanked out (replaced by NaN) in the output.
YEAR_UNKNOWN_CODES = [-1, -3]

# These two files contain the information describing the fields in the UKBB
# they are downloaded from:
# http://biobank.ndph.ox.ac.uk/showcase/coding.cgi?id=6
# http://biobank.ndph.ox.ac.uk/~bbdatan/Data_Dictionary_Showcase.csv
#codings = pandas.read_csv("metadata/self_reported_conditions_coding.txt", index_col=0) # NOTE: not currently used but handy for analysis
FIELD_DICTIONARY_PATH = "metadata/Data_Dictionary_Showcase.csv"

OUTPUT_COLUMNS = ["condition_code", "year"]


def build_parser():
    """Create the command-line parser for this script.

    Returns:
        An ``argparse.ArgumentParser`` with the required ``--table`` and
        ``--output`` options.
    """
    # The first positional parameter of ArgumentParser is ``prog``, so the
    # description must be passed by keyword; otherwise it replaces the program
    # name in the usage line and no description is shown at all.
    parser = argparse.ArgumentParser(
        description=description,
        # Keep the line layout of the description (it contains a column listing).
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("-t", "--table", help="UKBB table file to read in", required=True)
    parser.add_argument("-o", "--output", help="output tab-separated file to write to", required=True)
    return parser


def extract_conditions(data, num_entries):
    """Turn one wide chunk of the table into long-format condition entries.

    Args:
        data: DataFrame indexed by participant ID that contains, for every
            ``i`` in ``range(num_entries)``, the columns ``f.20002.0.<i>``
            (condition code) and ``f.20008.0.<i>`` (year).
        num_entries: number of array slots to read for each field.

    Returns:
        A list with one DataFrame per array slot, each indexed by ``ID`` with
        the columns ``condition_code`` and ``year``. Rows whose condition code
        is missing are dropped, and years equal to one of
        ``YEAR_UNKNOWN_CODES`` are replaced by NaN. ``data`` is not modified.

    Raises:
        KeyError: if an expected column is absent from ``data``.
    """
    frames = []
    for i in range(num_entries):
        code = data[f"f.{SELF_REPORTED_CONDITION_FIELD}.0.{i}"]
        year = data[f"f.{YEAR_FIRST_DIAGNOSED_FIELD}.0.{i}"]
        # ``where`` builds a new Series instead of assigning into a column
        # taken from ``data``; this avoids chained-assignment problems and
        # upcasts integer columns cleanly when NaN has to be inserted.
        year = year.where(~year.isin(YEAR_UNKNOWN_CODES))
        reported = code.notna()
        entries = pandas.DataFrame({"condition_code": code[reported], "year": year[reported]})
        frames.append(entries.rename_axis("ID"))
    return frames


def combine_entries(frames):
    """Concatenate per-slot entry frames and order them by participant ID.

    Args:
        frames: list of DataFrames as produced by ``extract_conditions``.

    Returns:
        A single DataFrame sorted by index. The sort is stable, so entries of
        one participant keep the order in which they were collected. An empty
        ``frames`` list gives an empty table with the usual columns instead of
        the ``ValueError`` that ``pandas.concat`` raises for no input.
    """
    if not frames:
        return pandas.DataFrame(columns=OUTPUT_COLUMNS, index=pandas.Index([], name="ID"))
    return pandas.concat(frames).sort_index(kind="mergesort")


def main(argv=None):
    """Read the table given on the command line and write the entries table.

    Args:
        argv: argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    args = build_parser().parse_args(argv)

    # The data dictionary is indexed by its third column; the ``Array`` value
    # of the condition field's row is used as the number of array slots.
    fields = pandas.read_csv(FIELD_DICTIONARY_PATH, index_col=2)
    num_entries = fields.loc[SELF_REPORTED_CONDITION_FIELD].Array

    processed_data = []
    # Read the table in chunks of 10,000 rows rather than all at once.
    for data in pandas.read_csv(args.table, sep="\t", index_col=0, chunksize=10_000, low_memory=False):
        processed_data.extend(extract_conditions(data, num_entries))

    combine_entries(processed_data).to_csv(args.output, sep="\t")


if __name__ == "__main__":
    main()