-
Notifications
You must be signed in to change notification settings - Fork 0
/
process_employment.py
executable file
·53 lines (37 loc) · 2.47 KB
/
process_employment.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
#!/usr/bin/env python
import argparse

# Help text shown by --help; also documents the layout of the output table.
description = '''
Gather employment history data for each person
The result is a table of entries, one for each occurance of an employment history entry
Columns:
ID year_job_started year_job_ended job_code night_shifts_worked consecutive_night_shifts_during_mixed_shifts consecutive_night_shifts_during_night_sihfts day_shifts_worked job_involved_shift_work mixture_of_day_and_night_shifts night_shifts_worked number_night_shifts_monthly_during_mixed_shifts number_night_shifts_monthly_during_night_shifts period_spent_working_day_shifts period_spent_working_mixed_shifts period_spent_working_night_shifts rest_days_during_mixed_shift_periods rest_days_during_night_shift_periods length_of_night_shift_during_mixed_shifts length_of_night_shift_during_night_shifts
So a participant may have multiple entries, one for each job worked
Note: many participants have no information about their job history
'''


def extract_job_entries(data, employment_fields, num_entries):
    """Split one chunk of the wide UKBB table into per-job tables.

    Args:
        data: DataFrame indexed by participant whose columns are named
            ``f.<field number>.0.<array index>``.
        employment_fields: mapping of output column name -> field number.
            It must contain the key ``"year_job_started"``.
        num_entries: number of array indexes (job slots) to look for.

    Returns:
        A list with one DataFrame per job slot that has a
        ``year_job_started`` column in ``data``. Each frame has the columns
        renamed to the keys of ``employment_fields`` and keeps only the rows
        where ``year_job_started`` is not missing.
    """
    entries = []
    for i in range(num_entries):
        # Map the raw column name to its readable name, keeping only the
        # columns that exist in this table.
        # Note: some fields are not present in later entries!
        # This is because a column is removed if its entries are blank for
        # all participants.
        present = {}
        for name, fnum in employment_fields.items():
            column = f"f.{fnum}.0.{i}"
            if column in data.columns:
                present[column] = name

        # Without the start-year column there is nothing to decide row
        # validity on, so this job slot cannot contribute any entries.
        if "year_job_started" not in present.values():
            continue

        # Select with an explicit list rather than a dict view.
        entry = data[list(present)].rename(columns=present)
        entry = entry[entry["year_job_started"].notna()]
        entries.append(entry)
    return entries


def main():
    """Parse the command line, read the table in chunks and write the result."""
    parser = argparse.ArgumentParser(
        # Must be passed by keyword: the first positional parameter of
        # ArgumentParser is `prog`, not `description`.
        description=description,
        # Keep the line breaks of the description in the --help output.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("-t", "--table", help="UKBB table file to read in", required=True)
    parser.add_argument("-o", "--output", help="output tab-separated file to write to", required=True)
    args = parser.parse_args()

    # These imports run only after the arguments were parsed, so --help and
    # usage errors exit before pandas is loaded.
    import pandas
    from fields_of_interest import employment_fields

    # This file contains the information describing the fields in the UKBB
    # http://biobank.ndph.ox.ac.uk/~bbdatan/Data_Dictionary_Showcase.csv
    field_data = pandas.read_csv("metadata/Data_Dictionary_Showcase.csv", index_col=2)
    # The "Array" value of the job-start field is used as the number of job
    # slots per participant; int() guards against the column being parsed as
    # a float dtype.
    num_entries = int(field_data.loc[employment_fields['year_job_started']].Array)

    processed_data = []
    # Read the table in chunks of 10,000 rows instead of all at once.
    for data in pandas.read_csv(args.table, sep="\t", index_col=0, chunksize=10_000, low_memory=False):
        data.index.rename("ID", inplace=True)
        processed_data.extend(extract_job_entries(data, employment_fields, num_entries))

    if not processed_data:
        # pandas.concat raises an unspecific ValueError on an empty list.
        raise SystemExit(f"No employment history entries could be extracted from {args.table}")

    all_data = pandas.concat(processed_data, sort=True)
    # mergesort is a stable sort: rows sharing an ID keep the order in which
    # they were collected.
    all_data.sort_index(inplace=True, kind="mergesort")
    all_data.to_csv(args.output, sep="\t")


if __name__ == "__main__":
    main()