-
Notifications
You must be signed in to change notification settings - Fork 11
/
parse_filings.py
94 lines (76 loc) · 2.61 KB
/
parse_filings.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
"""
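Module to get data for all cases between two dates.

To perform a scraper run, use: python parse_filings.py afterdate beforedate [outfile]
(dates in format mm-dd-yyyy; yyyy-mm-dd and mm/dd/yyyy are also accepted;
the optional outfile receives the parsed cases as JSON)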
"""
import datetime
import os
import sys
import json
from typing import List, Dict, Optional
import click
import scrapers
from parse_hearings import parse_all_from_parse_filings, persist_parsed_cases
import logging
# Module-wide logger. getLogger() with no name returns the root logger.
logger = logging.getLogger()
# Route log records to stdout instead of the default stderr.
# NOTE(review): no `level` is passed, so the root logger keeps its default
# WARNING threshold and the logger.info(...) calls in parse_filings_on_cloud
# are not emitted unless the level is lowered elsewhere -- confirm intent.
logging.basicConfig(stream=sys.stdout)
def parse_filings_on_cloud(
    afterdate: datetime.date,
    beforedate: datetime.date,
    get_old_active=True,
    showbrowser=False,
    scraper: Optional[scrapers.FakeScraper] = None,
):
    """
    Parse filings without the command line interface and outfile options.

    Args:
        afterdate: Start of the date range, forwarded to the scraper's
            `get_all_case_nums`.
        beforedate: End of the date range, forwarded to the scraper's
            `get_all_case_nums`.
        get_old_active: When true, the case numbers returned by
            `persist.get_old_active_case_nums()` are added to the ones found
            for the date range.
        showbrowser: Only used when no scraper is supplied; the default
            `scrapers.TravisScraper` is created with
            `headless=not showbrowser`.
        scraper: Scraper to use. When None, a `scrapers.TravisScraper` is
            created.

    Returns:
        Whatever `parse_all_from_parse_filings` returns for the
        de-duplicated case numbers.
    """
    logger.info(f"Parsing filings between {afterdate} and {beforedate}.")

    # Compare against None explicitly so that a caller-supplied scraper is
    # never discarded just because the object happens to be falsy.
    if scraper is None:
        scraper = scrapers.TravisScraper(headless=not showbrowser)

    # Copy into a fresh list: the value returned by the scraper is not ours to
    # mutate, and this also accepts any iterable of case numbers.
    all_case_nums = list(
        scraper.get_all_case_nums(afterdate=afterdate, beforedate=beforedate)
    )

    if get_old_active:
        # Imported lazily so `persist` is only loaded when old active cases
        # are requested. NOTE(review): presumably this avoids needing its
        # dependencies in test runs -- confirm.
        from persist import get_old_active_case_nums

        all_case_nums.extend(get_old_active_case_nums())

    # using dict to eliminate duplicates (keeps first-seen order)
    all_case_nums = list(dict.fromkeys(all_case_nums))
    logger.info(f"Found {len(all_case_nums)} case numbers (including old active ones).")

    cases = parse_all_from_parse_filings(all_case_nums, scraper=scraper)

    # persist cases only if not using the test scraper
    if isinstance(scraper, scrapers.TravisScraper):
        persist_parsed_cases(cases)

    return cases
# Command-line entry point for a scraper run.
# The decorators below define, in order, the positional arguments
# `afterdate`, `beforedate` and the optional `outfile`, plus the
# --showbrowser / --headless flag (headless unless --showbrowser is given).
# click.DateTime converts each date string using the first matching format.
# NOTE: click uses the function docstring as the command's --help text, so
# editing that docstring changes user-facing output.
@click.command()
@click.argument(
    "afterdate",
    type=click.DateTime(formats=["%Y-%m-%d", "%m-%d-%Y", "%m/%d/%Y"]),
    nargs=1,
)
@click.argument(
    "beforedate",
    type=click.DateTime(formats=["%Y-%m-%d", "%m-%d-%Y", "%m/%d/%Y"]),
    nargs=1,
)
@click.argument("outfile", type=click.File(mode="w"), required=False)
@click.option(
    "--showbrowser / --headless",
    default=False,
    help="whether to operate in headless mode or not",
)
def parse_filings(
    afterdate: datetime.date, beforedate: datetime.date, outfile, showbrowser=False
):
    """
    Perform a full 'scraper run' between `afterdate` and `beforedate`.
    Gets case details, events, and dispositions for all case numbers between
    `afterdate` and `beforedate`.
    Example of date format: 9-1-2020.
    Also updates rows in event/disposition/case_detail table that are still active.
    """
    # Delegate to the non-CLI implementation. `get_old_active` and `scraper`
    # are left at their defaults (True and None respectively).
    parsed_cases = parse_filings_on_cloud(
        afterdate=afterdate, beforedate=beforedate, showbrowser=showbrowser
    )
    # `outfile` is an open, writable file when the optional argument was
    # given; when it is omitted the value is falsy and nothing is written.
    if outfile:
        json.dump(parsed_cases, outfile)
    return parsed_cases
# Script entry point: run the click command only when this file is executed
# directly, not when it is imported (e.g. to call parse_filings_on_cloud).
if __name__ == "__main__":
    parse_filings()