Skip to content

Commit

Permalink
Merge pull request #111 from DSACMS/dev
Browse files Browse the repository at this point in the history
Merge Dev into Main
  • Loading branch information
IsaacMilarky authored Aug 2, 2024
2 parents 31ae527 + 7e29fb2 commit b879d83
Show file tree
Hide file tree
Showing 7 changed files with 417 additions and 347 deletions.
8 changes: 8 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,14 @@ the American public, but you are also welcome to submit anonymously.

For more information about our Security, Vulnerability, and Responsible Disclosure Policies, see [SECURITY.md](SECURITY.md).

### Software Bill of Materials (SBOM)

A Software Bill of Materials (SBOM) is a formal record containing the details and supply chain relationships of various components used in building software.

In the spirit of [Executive Order 14028 - Improving the Nation’s Cybersecurity](https://www.gsa.gov/technology/it-contract-vehicles-and-purchasing-programs/information-technology-category/it-security/executive-order-14028), an SBOM for this repository is provided here: https://github.com/DSACMS/dedupliFHIR/network/dependencies.

For more information and resources about SBOMs, visit: https://www.cisa.gov/sbom.

## Public domain

This project is in the public domain within the United States, and copyright and related rights in the work worldwide are waived through the [CC0 1.0 Universal public domain dedication](https://creativecommons.org/publicdomain/zero/1.0/) as indicated in our [LICENSE](LICENSE).
Expand Down
10 changes: 9 additions & 1 deletion cli/deduplifhirLib/normalization.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import re
from dateutil import parser as date_parser
from dateutil.parser import ParserError
from text_to_num import alpha2digit

NAME_ABBREVIATION_SYMBOLS = {
' jr ': 'junior',
Expand Down Expand Up @@ -307,6 +308,10 @@ def normalize_addr_text(input_text):
"""
text_copy = input_text
#text_copy = british_to_american(text_copy) not needed
try:
text_copy = alpha2digit(text_copy,"en")
except ValueError:
...
text_copy = remove_non_alphanum(text_copy)
print(text_copy)
text_copy = replace_abbreviations(text_copy.lower())
Expand All @@ -318,9 +323,12 @@ def normalize_addr_text(input_text):
NAME_TEXT = "Greene,Jacquleine"
print(normalize_name_text(NAME_TEXT))

PLACE_TEXT = "7805 Kartina Motorawy Apt. 313,Taylorstad,New Hampshire"
PLACE_TEXT = "7805 Kartina Motorawy Apt. three hundred thirteen ,Taylorstad,New Hampshire"

print(normalize_addr_text(PLACE_TEXT))

DATE_TEXT = "December 10, 1999"
print(normalize_date_text(DATE_TEXT))

NUM_TEXT = "I have one hundred twenty three apples and forty-five oranges. Valetnine"
print(alpha2digit(NUM_TEXT,'en'))
61 changes: 42 additions & 19 deletions cli/deduplifhirLib/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,35 +15,58 @@
import json
import uuid
import pandas as pd
import splink.duckdb.comparison_library as cl
import splink.duckdb.comparison_template_library as ctl
from splink.duckdb.blocking_rule_library import block_on
import splink.comparison_library as cl
from splink import SettingsCreator, block_on
from deduplifhirLib.normalization import (
normalize_addr_text, normalize_name_text, normalize_date_text
)


dir_path = os.path.dirname(os.path.realpath(__file__))
with open(dir_path + '/splink_settings.json',"r",encoding="utf-8") as f:
SPLINK_LINKER_SETTINGS_PATIENT_DEDUPE = json.load(f)
splink_settings_dict = json.load(f)


SPLINK_LINKER_SETTINGS_PATIENT_DEDUPE.update({
"comparisons": [
ctl.name_comparison("given_name", term_frequency_adjustments=True),
ctl.name_comparison("family_name", term_frequency_adjustments=True),
ctl.date_comparison("birth_date", cast_strings_to_date=True, invalid_dates_as_null=True),
ctl.postcode_comparison("postal_code"),
cl.exact_match("street_address", term_frequency_adjustments=True),
cl.exact_match("phone", term_frequency_adjustments=True),
]
})

#apply blocking function to translate into sql rules
blocking_rules = SPLINK_LINKER_SETTINGS_PATIENT_DEDUPE["blocking_rules_to_generate_predictions"]
BLOCKING_RULE_STRINGS = blocking_rules
SPLINK_LINKER_SETTINGS_PATIENT_DEDUPE["blocking_rules_to_generate_predictions"] = list(
map(block_on,blocking_rules))
BLOCKING_RULE_STRINGS = splink_settings_dict["blocking_rules_to_generate_predictions"]
#blocking_rules = list(
# map(block_on,blocking_rules))

blocking_rules = []
for rule in BLOCKING_RULE_STRINGS:
if isinstance(rule, list):
blocking_rules.append(block_on(*rule))
else:
blocking_rules.append(block_on(rule))


comparison_rules = [
cl.ExactMatch("street_address").configure(
term_frequency_adjustments=True
),
cl.ExactMatch("phone").configure(
term_frequency_adjustments=True
),
cl.NameComparison("given_name").configure(
term_frequency_adjustments=True
),
cl.NameComparison("family_name").configure(
term_frequency_adjustments=True
),
cl.DateOfBirthComparison("birth_date",input_is_string=True),
cl.PostcodeComparison("postal_code")
]


SPLINK_LINKER_SETTINGS_PATIENT_DEDUPE = SettingsCreator(
link_type=splink_settings_dict["link_type"],
blocking_rules_to_generate_predictions=blocking_rules,
comparisons=comparison_rules,
max_iterations=splink_settings_dict["max_iterations"],
em_convergence=splink_settings_dict["em_convergence"])





#NOTE: The only reason this function is defined outside utils.py is because of a known bug with
Expand Down
8 changes: 5 additions & 3 deletions cli/deduplifhirLib/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@
from multiprocessing import Pool
from functools import wraps
import pandas as pd
from splink.duckdb.linker import DuckDBLinker
#from splink.duckdb.linker import DuckDBLinker
from splink import DuckDBAPI, Linker

from deduplifhirLib.settings import (
SPLINK_LINKER_SETTINGS_PATIENT_DEDUPE, BLOCKING_RULE_STRINGS, read_fhir_data
Expand Down Expand Up @@ -177,8 +178,9 @@ def wrapper(*args,**kwargs):
print(f"Could not assert the proper number of unique records for rule {rule}")
raise e

lnkr = DuckDBLinker(train_frame, SPLINK_LINKER_SETTINGS_PATIENT_DEDUPE)
lnkr.estimate_u_using_random_sampling(max_pairs=5e6)
#lnkr = DuckDBLinker(train_frame, SPLINK_LINKER_SETTINGS_PATIENT_DEDUPE)
lnkr = Linker(train_frame,SPLINK_LINKER_SETTINGS_PATIENT_DEDUPE,db_api=DuckDBAPI())
lnkr.training.estimate_u_using_random_sampling(max_pairs=5e6)

kwargs['linker'] = lnkr
return func(*args,**kwargs)
Expand Down
17 changes: 9 additions & 8 deletions cli/ecqm_dedupe.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,9 @@
"""
import os
import os.path
import difflib
import pandas as pd
import click
from splink.duckdb.blocking_rule_library import block_on
from splink import block_on
from deduplifhirLib.utils import use_linker


Expand All @@ -29,21 +28,23 @@ def dedupe_data(fmt,bad_data_path, output_path,linker=None): #pylint: disable=un
print(os.getcwd())
#linker is created by use_linker decorator
blocking_rule_for_training = block_on("ssn")
linker.estimate_parameters_using_expectation_maximisation(
linker.training.estimate_parameters_using_expectation_maximisation(
blocking_rule_for_training)

blocking_rule_for_training = block_on("birth_date") # block on year
linker.estimate_parameters_using_expectation_maximisation(
linker.training.estimate_parameters_using_expectation_maximisation(
blocking_rule_for_training)

blocking_rule_for_training = block_on(["street_address", "postal_code"])
linker.estimate_parameters_using_expectation_maximisation(
blocking_rule_for_training = block_on("street_address", "postal_code")
linker.training.estimate_parameters_using_expectation_maximisation(
blocking_rule_for_training)


pairwise_predictions = linker.predict()
pairwise_predictions = linker.inference.predict()

clusters = linker.cluster_pairwise_predictions_at_threshold(pairwise_predictions, 0.95)
clusters = linker.clustering.cluster_pairwise_predictions_at_threshold(
pairwise_predictions, 0.95
)

deduped_record_mapping = clusters.as_pandas_dataframe()

Expand Down
Loading

0 comments on commit b879d83

Please sign in to comment.