Skip to content
This repository has been archived by the owner on Jan 30, 2024. It is now read-only.

Commit

Permalink
make itemization loader use iterator instead of a giant file
Browse files: browse the repository at this point in the history
  • Loading branch information
rshorey committed Apr 3, 2019
1 parent 95a8653 commit c7b0b89
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 13 deletions.
42 changes: 30 additions & 12 deletions cycle_2020/utils/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -462,7 +462,7 @@ def load_filing(filing, filename, filing_fieldnames):

#filing does not exist or it failed previously
try:
filing_dict = process_filing.process_electronic_filing(filename)
filing_dict = process_filing.process_electronic_filing(filename, dump_full=False)
except Exception as e:
logging.log(title="fec2json failed",
text="fec2json failed {} {}".format(filing, e),
Expand Down Expand Up @@ -591,17 +591,35 @@ def load_filing(filing, filename, filing_fieldnames):
schb_count = 0
sche_count = 0
if 'itemizations' in filing_dict:
if 'SchA' in filing_dict['itemizations']:
scha_count = load_itemizations(ScheduleA, filing_dict['itemizations']['SchA'])
if 'SchB' in filing_dict['itemizations']:
schb_count = load_itemizations(ScheduleB, filing_dict['itemizations']['SchB'])
if 'SchE' in filing_dict['itemizations']:
sche_count = load_itemizations(ScheduleE, filing_dict['itemizations']['SchE'])
if 'F57' in filing_dict['itemizations']:
sche_count += load_itemizations(ScheduleE, filing_dict['itemizations']['F57'])
sys.stdout.write("inserted {} schedule A's\n".format(scha_count))
sys.stdout.write("inserted {} schedule B's\n".format(schb_count))
sys.stdout.write("inserted {} schedule E's\n".format(sche_count))
load_chunk_size = 20000
i = 0
complete = False
while not complete:
itemization_dict = {}
while i < load_chunk_size:
try:
line = next(filing_dict['itemizations'])
except StopIteration:
print("stopping")
complete = True
break
#print(line)
itemization_type = process_filing.get_itemization_type(line.get('form_type'))
if itemization_type not in itemization_dict:
itemization_dict[itemization_type] = []
itemization_dict[itemization_type].append(line)

if 'SchA' in itemization_dict:
scha_count = load_itemizations(ScheduleA, itemization_dict['SchA'])
if 'SchB' in itemization_dict:
schb_count = load_itemizations(ScheduleB, itemization_dict['SchB'])
if 'SchE' in itemization_dict:
sche_count = load_itemizations(ScheduleE, itemization_dict['SchE'])
if 'F57' in itemization_dict:
sche_count += load_itemizations(ScheduleE, itemization_dict['F57'])
sys.stdout.write("inserted {} schedule A's\n".format(scha_count))
sys.stdout.write("inserted {} schedule B's\n".format(schb_count))
sys.stdout.write("inserted {} schedule E's\n".format(sche_count))

except:
#something failed in the transaction loading, keep the filing as failed
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ django-storages
djangorestframework
django-pure-pagination
django-localflavor
-e git+https://github.com/newsdev/fec2json@4655e710a3c74b7991fb963becbcdc71f65ad560#egg=fec2json
-e git+https://github.com/newsdev/fec2json@5bf66570c248e994086c256b9abc49abc16f2b8c#egg=fec2json
lxml==4.2.1
requests

0 comments on commit c7b0b89

Please sign in to comment.