Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Multi-Cardinality Support to DedupliFHIR Backend #122

Merged
merged 9 commits into from
Aug 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
132 changes: 91 additions & 41 deletions cli/deduplifhirLib/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,38 +32,90 @@
#blocking_rules = list(
# map(block_on,blocking_rules))

# Translate every configured blocking entry into a splink blocking rule.
# A list entry is unpacked so the rule blocks on all of its columns at once;
# any other entry is passed to block_on unchanged.
blocking_rules = []
for rule in BLOCKING_RULE_STRINGS:
    blocking_rules.append(
        block_on(*rule) if isinstance(rule, list) else block_on(rule)
    )
def get_additional_comparison_rules(parsed_data_df):
    """
    Yield splink comparisons for the address-related columns of a dataframe.

    Every column whose name contains 'street_address' gets an exact-match
    comparison with term frequency adjustments enabled; every other column
    whose name contains 'postal_code' gets a postcode comparison. All
    remaining columns are ignored.

    Arguments:
        parsed_data_df: The dataframe that was parsed from the user that we want to
            find duplicates in

    Returns:
        A generator of splink comparison objects, in column order
    """
    for column_name in parsed_data_df.columns:
        if 'street_address' in column_name:
            street_comparison = cl.ExactMatch(column_name)
            yield street_comparison.configure(term_frequency_adjustments=True)
            continue

        if 'postal_code' in column_name:
            yield cl.PostcodeComparison(column_name)

def create_settings(parsed_data_df):
    """
    This function generates a Splink SettingsCreator object based on the parsed
    input data's columns and the blocking settings in splink_settings.json

    Arguments:
        parsed_data_df: The dataframe that was parsed from the user that we want to
            find duplicates in

    Returns:
        A splink SettingsCreator object to be used with a splink linker object
    """
    # A list entry blocks on several columns at once, so it is unpacked into
    # block_on; any other entry is passed through as a single argument.
    blocking_rules = [
        block_on(*rule) if isinstance(rule, list) else block_on(rule)
        for rule in BLOCKING_RULE_STRINGS
    ]

    # Column-dependent comparisons come first; their number depends on which
    # columns are present in the dataframe.
    comparison_rules = list(get_additional_comparison_rules(parsed_data_df))

    # Comparisons on fixed column names are appended after them.
    comparison_rules.extend([
        cl.ExactMatch("phone").configure(term_frequency_adjustments=True),
        cl.NameComparison("given_name").configure(term_frequency_adjustments=True),
        cl.NameComparison("family_name").configure(term_frequency_adjustments=True),
        cl.DateOfBirthComparison("birth_date", input_is_string=True),
    ])

    return SettingsCreator(
        link_type=splink_settings_dict["link_type"],
        blocking_rules_to_generate_predictions=blocking_rules,
        comparisons=comparison_rules,
        max_iterations=splink_settings_dict["max_iterations"],
        em_convergence=splink_settings_dict["em_convergence"],
    )

# Fixed list of splink comparisons, one per patient column that is compared
# between records. Every column name here is hard-coded.
comparison_rules = [
cl.ExactMatch("street_address").configure(
term_frequency_adjustments=True
),
cl.ExactMatch("phone").configure(
term_frequency_adjustments=True
),
cl.NameComparison("given_name").configure(
term_frequency_adjustments=True
),
cl.NameComparison("family_name").configure(
term_frequency_adjustments=True
),
# birth_date is passed with input_is_string=True
cl.DateOfBirthComparison("birth_date",input_is_string=True),
cl.PostcodeComparison("postal_code")
]


# Module-level settings object built from the `blocking_rules` and
# `comparison_rules` lists plus the link type, iteration limit and convergence
# values read from `splink_settings_dict`.
SPLINK_LINKER_SETTINGS_PATIENT_DEDUPE = SettingsCreator(
link_type=splink_settings_dict["link_type"],
blocking_rules_to_generate_predictions=blocking_rules,
comparisons=comparison_rules,
max_iterations=splink_settings_dict["max_iterations"],
em_convergence=splink_settings_dict["em_convergence"])
def parse_fhir_dates(fhir_json_obj):
    """
    A generator function that parses the address portion of a FHIR file
    into dictionary objects that can be added to the overall patient record.

    Note that, despite its name, this function extracts addresses rather than
    dates. Each address found under the first entry's resource produces one
    dictionary whose keys carry the address's position as a suffix
    (street_address0, city0, state0, postal_code0, street_address1, ...).

    Arguments:
        fhir_json_obj: The object that has been parsed from the FHIR data

    Returns:
        A generator containing dictionaries of address data. Each value is a
        single-element list holding the normalized text.
    """
    addresses = fhir_json_obj['entry'][0]['resource']['address']

    # enumerate yields (index, item): the index becomes the column suffix and
    # the item is the address dictionary being read.
    for n, addr in enumerate(addresses):
        yield {
            # NOTE(review): the address lines are concatenated with no
            # separator between them -- confirm this is intended.
            f"street_address{n}": [normalize_addr_text(''.join(addr['line']))],
            f"city{n}": [normalize_addr_text(addr['city'])],
            f"state{n}": [normalize_addr_text(addr['state'])],
            f"postal_code{n}": [normalize_addr_text(addr['postalCode'])]
        }



Expand Down Expand Up @@ -103,21 +155,19 @@ def read_fhir_data(patient_record_path):
patient_json_record['entry'][0]['resource']['birthDate']
),
"phone": [patient_json_record['entry'][0]['resource']['telecom'][0]['value']],
"street_address": [
normalize_addr_text(
patient_json_record['entry'][0]['resource']['address'][0]['line'][0]
)
],
"city": [
normalize_addr_text(patient_json_record['entry'][0]['resource']['address'][0]['city'])
],
"state": [
normalize_addr_text(patient_json_record['entry'][0]['resource']['address'][0]['state'])
],
"postal_code": [patient_json_record['entry'][0]['resource']['address'][0]['postalCode']],
"ssn": [patient_json_record['entry'][0]['resource']['identifier'][1]['value']],
"path": patient_record_path
}
#print(patient_dict)

try:
    # The second given name, when present, is stored as the middle name.
    # The key must be 'resource', matching the other lookups on this record;
    # a misspelled key raises KeyError, which the handler below does not catch.
    patient_dict["middle_name"] = [
        normalize_name_text(patient_json_record['entry'][0]['resource']['name'][0]['given'][1])
    ]
except IndexError:
    # Only one given name is present: record an empty middle name.
    patient_dict["middle_name"] = [""]
    print("no middle name found!")

for date in parse_fhir_dates(patient_json_record):
patient_dict.update(date)

return pd.DataFrame(patient_dict)
2 changes: 1 addition & 1 deletion cli/deduplifhirLib/splink_settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
"blocking_rules_to_generate_predictions": [
"birth_date",
["ssn", "birth_date"],
["ssn", "street_address"],
["ssn", "street_address0"],
"phone"
],
"max_iterations": 20,
Expand Down
11 changes: 5 additions & 6 deletions cli/deduplifhirLib/tests/main_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
"""
return CliRunner()

def test_dedupe_data_with_csv_output(cli_runner):

Check warning on line 42 in cli/deduplifhirLib/tests/main_test.py

View workflow job for this annotation

GitHub Actions / runner / pylint

[pylint] reported by reviewdog 🐶 W0621: Redefining name 'cli_runner' from outer scope (line 36) (redefined-outer-name) Raw Output: cli/deduplifhirLib/tests/main_test.py:42:37: W0621: Redefining name 'cli_runner' from outer scope (line 36) (redefined-outer-name)
"""
Test dedupe_data function with CSV output format.
"""
Expand All @@ -61,12 +61,12 @@
# Clean up: delete output file
os.remove(output_path)

def test_dedupe_data_with_specific_csv(cli_runner):

Check warning on line 64 in cli/deduplifhirLib/tests/main_test.py

View workflow job for this annotation

GitHub Actions / runner / pylint

[pylint] reported by reviewdog 🐶 W0621: Redefining name 'cli_runner' from outer scope (line 36) (redefined-outer-name) Raw Output: cli/deduplifhirLib/tests/main_test.py:64:39: W0621: Redefining name 'cli_runner' from outer scope (line 36) (redefined-outer-name)
"""
Test dedupe_data function with specific CSV data to verify deduplication.
"""
# Prepare test data
test_data_csv = """id,truth_value,family_name,given_name,gender,birth_date,phone,street_address,city,state,postal_code,SSN
test_data_csv = """id,truth_value,family_name,given_name,gender,birth_date,phone,street_address0,city0,state0,postal_code0,SSN
IsaacMilarky marked this conversation as resolved.
Show resolved Hide resolved
8,9b0b0b7c-e05e-4c89-991d-268eab2483f7,Obrien,Curtis,M,07/02/1996,,300 Amy Corners Suite 735,Rileytown,Alaska,60281,480-21-0833
342,9b0b0b7c-e05e-4c89-991d-268eab2483f7,Orbien,Cutris,M,07/02/1996,,300 Amy oCrenrs Suite 735,Rileytown,Alaska,60281,480-210-833
502,9b0b0b7c-e05e-4c89-991d-268eab2483f7,bOrien,Curtsi,M,07/02/1996,,300 AmyCo rners Suite 735,Rileytown,Alaska,60281,480-21-8033
Expand All @@ -78,8 +78,7 @@
273,04584982-ae7a-44a1-b4f0-e927a8bab0e1,Russlel,Lnidsay,F,02/05/1977,,2110 Kimbelry Vilalges Apt. 639,New David,Wyoming,52082,211-52-6989
311,04584982-ae7a-44a1-b4f0-e927a8bab0e1,Russlel,Lindasy,F,02/05/1977,,2110 Kimbelry Villgaes Apt. 639,New David,Wyoming,52082,211-52-9698
652,04584982-ae7a-44a1-b4f0-e927a8bab0e1,uRssell,Lidnsay,F,02/05/1977,,2110 Kimberly Vlilagse Apt. 639,New David,Wyoming,52082,121-52-6998
726,04584982-ae7a-44a1-b4f0-e927a8bab0e1,uRssell,Lindasy,F,02/05/1977,,2110 Kmiberly Vilalges Apt. 639,New David,Wyoming,52082,2115-2-6
"""
726,04584982-ae7a-44a1-b4f0-e927a8bab0e1,uRssell,Lindasy,F,02/05/1977,,2110 Kmiberly Vilalges Apt. 639,New David,Wyoming,52082,2115-2-6S"""
IsaacMilarky marked this conversation as resolved.
Show resolved Hide resolved

# Write test data to specific.csv
with open('specific.csv', 'w',encoding='utf-8') as f:
Expand All @@ -102,7 +101,7 @@
os.remove('specific.csv')


def test_dedupe_data_with_json_output(cli_runner):

Check warning on line 104 in cli/deduplifhirLib/tests/main_test.py

View workflow job for this annotation

GitHub Actions / runner / pylint

[pylint] reported by reviewdog 🐶 W0621: Redefining name 'cli_runner' from outer scope (line 36) (redefined-outer-name) Raw Output: cli/deduplifhirLib/tests/main_test.py:104:38: W0621: Redefining name 'cli_runner' from outer scope (line 36) (redefined-outer-name)
"""
Test dedupe_data function with JSON output format.
"""
Expand All @@ -124,7 +123,7 @@
# Clean up: delete output file
os.remove(output_path)

def test_dedupe_data_with_invalid_format(cli_runner):

Check warning on line 126 in cli/deduplifhirLib/tests/main_test.py

View workflow job for this annotation

GitHub Actions / runner / pylint

[pylint] reported by reviewdog 🐶 W0621: Redefining name 'cli_runner' from outer scope (line 36) (redefined-outer-name) Raw Output: cli/deduplifhirLib/tests/main_test.py:126:41: W0621: Redefining name 'cli_runner' from outer scope (line 36) (redefined-outer-name)
"""
Test dedupe_data function with an invalid data format.
"""
Expand All @@ -148,16 +147,16 @@
os.remove(output_path)
os.remove(bad_data_path)

def test_dedupe_accuracy(cli_runner):

Check warning on line 150 in cli/deduplifhirLib/tests/main_test.py

View workflow job for this annotation

GitHub Actions / runner / pylint

[pylint] reported by reviewdog 🐶 W0621: Redefining name 'cli_runner' from outer scope (line 36) (redefined-outer-name) Raw Output: cli/deduplifhirLib/tests/main_test.py:150:25: W0621: Redefining name 'cli_runner' from outer scope (line 36) (redefined-outer-name)
"""
Test dedupe_data function for deduplication accuracy using a dataset with known duplicates.
"""
# Prepare test data
test_data_csv = """id,truth_value,family_name,given_name,gender,birth_date,phone,street_address,city,state,postal_code,SSN
test_data_csv = """id,truth_value,family_name,given_name,gender,birth_date,phone,street_address0,city0,state0,postal_code0,SSN
IsaacMilarky marked this conversation as resolved.
Show resolved Hide resolved
1,duplicate,Smith,John,M,01/01/1990,,123 Elm St,Springfield,IL,62701,123-45-6789
2,duplicate,Smyth,John,M,01/01/1990,,123 Elm St.,Springfield,IL,62701,123-45-6789
3,unique,Doe,Jane,F,02/02/1992,,456 Oak St,Springfield,IL,62702,987-65-4321
"""
3,unique,Doe,Jane,F,02/02/1992,,456 Oak St,Springfield,IL,62702,987-65-4321"""

with open('accuracy.csv', 'w',encoding='utf-8') as f:
f.write(test_data_csv)

Expand All @@ -180,7 +179,7 @@
5000,
2500
],indirect=True)
def test_dedupe_data_with_large_dataset(generate_mock_data_fixture, cli_runner):

Check warning on line 182 in cli/deduplifhirLib/tests/main_test.py

View workflow job for this annotation

GitHub Actions / runner / pylint

[pylint] reported by reviewdog 🐶 W0621: Redefining name 'generate_mock_data_fixture' from outer scope (line 18) (redefined-outer-name) Raw Output: cli/deduplifhirLib/tests/main_test.py:182:40: W0621: Redefining name 'generate_mock_data_fixture' from outer scope (line 18) (redefined-outer-name)

Check warning on line 182 in cli/deduplifhirLib/tests/main_test.py

View workflow job for this annotation

GitHub Actions / runner / pylint

[pylint] reported by reviewdog 🐶 W0621: Redefining name 'cli_runner' from outer scope (line 36) (redefined-outer-name) Raw Output: cli/deduplifhirLib/tests/main_test.py:182:68: W0621: Redefining name 'cli_runner' from outer scope (line 36) (redefined-outer-name)
"""
Test dedupe_data function with a large dataset.
"""
Expand Down
Loading