From d3f351103e1080be221a6706153a846043a7ef93 Mon Sep 17 00:00:00 2001 From: Michael Rollins Date: Tue, 22 Sep 2020 21:36:11 -0500 Subject: [PATCH] #126 added tlo legislator id import services --- src/config/settings/base.py | 4 +- src/influencetx/tlo/apps.py | 5 + src/influencetx/tlo/fetch.py | 57 +++++++ .../tlo/management/{ => commands}/__init__.py | 0 .../management/commands/sync_tx_lege_ids.py | 68 ++++++++ .../sync_bill_witnesses_from_tlo.py | 15 -- src/influencetx/tlo/{ => scrapper}/demo.py | 0 .../tlo/scrapper/tlo-vote-tally.py | 158 ++++++++++++++++++ src/influencetx/tlo/services.py | 68 ++++++++ src/requirements/base.txt | 2 + 10 files changed, 361 insertions(+), 16 deletions(-) create mode 100644 src/influencetx/tlo/apps.py create mode 100644 src/influencetx/tlo/fetch.py rename src/influencetx/tlo/management/{ => commands}/__init__.py (100%) create mode 100644 src/influencetx/tlo/management/commands/sync_tx_lege_ids.py delete mode 100644 src/influencetx/tlo/management/sync_bill_witnesses_from_tlo.py rename src/influencetx/tlo/{ => scrapper}/demo.py (100%) create mode 100644 src/influencetx/tlo/scrapper/tlo-vote-tally.py create mode 100644 src/influencetx/tlo/services.py diff --git a/src/config/settings/base.py b/src/config/settings/base.py index 03186e6..572d82a 100644 --- a/src/config/settings/base.py +++ b/src/config/settings/base.py @@ -61,7 +61,8 @@ 'influencetx.legislators.apps.LegislatorsConfig', 'influencetx.openstates.apps.OpenstatesConfig', 'influencetx.tpj.apps.TPJConfig', - "influencetx.finances.apps.FinancesConfig", + 'influencetx.finances.apps.FinancesConfig', + 'influencetx.tlo.apps.TloConfig', ] # See: https://docs.djangoproject.com/en/dev/ref/settings/#installed-apps @@ -332,3 +333,4 @@ # ------------------------------------------------------------------------------ GOOGLE_API_KEY = env('GOOGLE_API_KEY', default='') GOOGLE_ANALYTICS = env('GOOGLE_ANALYTICS', default='') +TLO_SESSION = env('SESSION', default='86') diff --git a/src/influencetx/tlo/apps.py b/src/influencetx/tlo/apps.py new file mode 100644 index 0000000..57b475f --- /dev/null +++ b/src/influencetx/tlo/apps.py @@ -0,0 +1,5 @@ +from django.apps import AppConfig + + +class TloConfig(AppConfig): + name = 'influencetx.tlo' diff --git a/src/influencetx/tlo/fetch.py b/src/influencetx/tlo/fetch.py new file mode 100644 index 0000000..7d89725 --- /dev/null +++ b/src/influencetx/tlo/fetch.py @@ -0,0 +1,57 @@ +# import json +import os +import logging +import lxml.html +import re +import requests +from django.conf import settings +from time import sleep +LOG = logging.getLogger(__name__) + + +def lxmlize(url, session=requests.Session()): + """Parses document into an LXML object and makes links absolute. + Args: + url (str): URL of the document to parse. + Returns: + Element: Document node representing the page. + """ + try: + response = session.get(url, timeout=10) + except requests.exceptions.SSLError: + print('`lxmlize()` failed due to SSL error, trying ' + 'an unverified `requests.get()`') + response = session.get(url, verify=False, timeout=10) + except requests.exceptions.ConnectionError: + print('Request limit exceeded. Waiting 10 seconds.') + response = session.get(url, timeout=10) + page = lxml.html.fromstring(response.text) + page.make_links_absolute(url) + response.close() + return page + + +def get_legislator_ids(session, chamber): + """ + Return a list of tlo legislator id & names + """ + print(f"Getting {chamber} legislator ids for session {session}") + chamber_map = { + 'Senate': 'S', + 'House': 'H', + } + url = f"https://capitol.texas.gov/Members/Members.aspx?Chamber={chamber_map[chamber]}" + page = lxmlize(url) + # table id="dataListMembers" + hrefs = page.xpath('//table[@id="dataListMembers"]//@href') + #LOG.warn(hrefs) + id_map = [] + for ref in hrefs: + m = re.search(r'(?<=Code=)[A-Z0-9]+$', ref) + id = m.group(0) + #LOG.warn(f'Found ID {id} in {chamber}') + name = page.xpath(f'//table[@id="dataListMembers"]//a[contains(@href, "{id}")]/text()')[0].strip() + #LOG.info(f'Found name {name} for {id} in {chamber}') + data = {'id': f'{id}', 'name': f'{name}', 'url': f'{ref}'} + id_map.append(data) + return {f'{session}': {f'{chamber}': list(id_map)}} diff --git a/src/influencetx/tlo/management/__init__.py b/src/influencetx/tlo/management/commands/__init__.py similarity index 100% rename from src/influencetx/tlo/management/__init__.py rename to src/influencetx/tlo/management/commands/__init__.py diff --git a/src/influencetx/tlo/management/commands/sync_tx_lege_ids.py b/src/influencetx/tlo/management/commands/sync_tx_lege_ids.py new file mode 100644 index 0000000..2665aaf --- /dev/null +++ b/src/influencetx/tlo/management/commands/sync_tx_lege_ids.py @@ -0,0 +1,68 @@ +""" +Django admin command wrapper around `sync_legislator_id` in `influencetx.tlo.services`. +""" +from django.core.management.base import BaseCommand +from django.conf import settings +from influencetx.tlo import fetch, services + + +class Command(BaseCommand): + + help = 'Sync legislator ids from TLO' + + def add_arguments(self, parser): + parser.add_argument('--max', default=200, type=int, + help='Max number of legislators to sync. Mainly used for testing.' + + 'Default is 200.') + parser.add_argument('--session', type=int, default=None, + help='Pull data for specified session. Defaults to settings.') + parser.add_argument('--chamber', type=str, default=None, + help='Chamber to sync legislators from. Default is both.') + + def handle(self, *args, **options): + total_action = 0 + if not options['session']: + options['session'] = settings.TLO_SESSION + if not options['chamber']: + chamber_list = ['House', 'Senate'] + for chamber in chamber_list: + options['chamber'] = chamber + data = self._fetch_legislators(options) + legislator_list = data[options['session']][options['chamber']] + #self.stdout.write(f'{legislator_list}') + if not legislator_list: + self.stdout.write(self.style.SUCCESS('No data to sync')) + return + for record in legislator_list: + #self.stdout.write(f'Processing record: {record}') + info = services.sync_legislator_id(record, options['session'], options['chamber']) + self._write_info(info) + total_action += 1 + else: + data = self._fetch_legislators(options) + legislator_list = data[options['session']][options['chamber']] + #self.stdout.write(f'{legislator_list}') + if not legislator_list: + self.stdout.write(self.style.SUCCESS('No data to sync')) + return + for record in legislator_list: + #self.stdout.write(f'Processing record: {record}') + info = services.sync_legislator_id(record, options['session'], options['chamber']) + self._write_info(info) + total_action += 1 + + self.stdout.write(self.style.SUCCESS(f'Successfully synced {total_action} legislator ids')) + + def _write_info(self, info): + if info.action == services.Action.FAILED: + action = self.style.NOTICE(info.action) + self.stdout.write(f'{action}: {info.error}') + #raise Exception(f"Write failed with {action}: {info.error}") + else: + action = self.style.SUCCESS(info.action) + legislator = info.instance + self.stdout.write(f'{action}: {legislator} ({legislator.tx_lege_id})') + + def _fetch_legislators(self, options): + """Return list of legislator data from TLO.""" + return fetch.get_legislator_ids(options['session'], options['chamber']) diff --git a/src/influencetx/tlo/management/sync_bill_witnesses_from_tlo.py b/src/influencetx/tlo/management/sync_bill_witnesses_from_tlo.py deleted file mode 100644 index 99b8a66..0000000 --- a/src/influencetx/tlo/management/sync_bill_witnesses_from_tlo.py +++ /dev/null @@ -1,15 +0,0 @@ -""" -Django admin command wrapper around `sync_bill_data` in `influencetx.openstates.services`. -""" -from django.core.management.base import BaseCommand - -from influencetx.openstates import fetch, services - - -class Command(BaseCommand): - - help = 'Sync bill witness data from Texas Legislature Online' - - def handle(self, *args, **options): - bill_id = 123 - print "hello wurld!" diff --git a/src/influencetx/tlo/demo.py b/src/influencetx/tlo/scrapper/demo.py similarity index 100% rename from src/influencetx/tlo/demo.py rename to src/influencetx/tlo/scrapper/demo.py diff --git a/src/influencetx/tlo/scrapper/tlo-vote-tally.py b/src/influencetx/tlo/scrapper/tlo-vote-tally.py new file mode 100644 index 0000000..73119bb --- /dev/null +++ b/src/influencetx/tlo/scrapper/tlo-vote-tally.py @@ -0,0 +1,158 @@ +# from https://github.com/lazarus1331/tlo-vote-tally/blob/master/tlo-vote-tally.py +import argparse +import csv +import lxml.html +import re +import requests +from time import sleep + +def lxmlize(url, session=requests.Session()): + """Parses document into an LXML object and makes links absolute. + Args: + url (str): URL of the document to parse. + Returns: + Element: Document node representing the page. + """ + try: + response = session.get(url, timeout=10) + except requests.exceptions.SSLError: + print('`lxmlize()` failed due to SSL error, trying ' + 'an unverified `requests.get()`') + response = session.get(url, verify=False, timeout=10) + except requests.exceptions.ConnectionError: + print('Request limit exceeded. Waiting 10 seconds.') + response = session.get(url, timeout=10) + page = lxml.html.fromstring(response.text) + page.make_links_absolute(url) + response.close() + return page + +def get_chamber_bills(chamber, session='85R'): + """ + Return a list of tlo urls for each bill detected. + """ + print(f"Getting {chamber} bills") + chamber_map = { + 'senate': 'senatefiled', + 'house': 'housefiled', + } + url = f"https://capitol.texas.gov/Reports/Report.aspx?LegSess={session}&ID={chamber_map[chamber]}" + page = lxmlize(url) + # the only links on the page are to tlo urls + hrefs = page.xpath('//@href') + links = [] + for url in hrefs: + if re.match(r'.*=[SH]B\d+$', url): + links.append(url) + print(f'Found {len(links)} {chamber} bills') + return links + +def scrape_chamber(chamber, bill_list): + """ + Return a list dictionary objects for each bill's vote record counts. + Also includes links to journals used as original source, and date of vote. + """ + #bill_list = bill_list[-300:0] + bill_votes = [] + print(f'Scraping bill urls for chamber {chamber}') + s = requests.Session() + for url in bill_list: + #print(url) + bill = re.search(r'[SH]B\d+$', url).group() + page = lxmlize(url, s) + house_vote_records = page.xpath('//table/tr[@id="houvote"]') + senate_vote_records = page.xpath('//table/tr[@id="senvote"]') + num_house_votes = len(house_vote_records) + num_senate_votes = len(senate_vote_records) + h_data, s_data = [], [] + for record in house_vote_records: + journal_link = record.xpath('./td[2]/a/@href')[0] + type = record.xpath('./td[2]/a/text()')[0] + date = record.xpath('./td[4]/text()')[0].strip() + h_data.append({'date': date, 'source': journal_link, 'type': type}) + for record in senate_vote_records: + journal_link = record.xpath('./td[2]/a/@href')[0] + type = record.xpath('./td[2]/a/text()')[0] + date = record.xpath('./td[4]/text()')[0].strip() + s_data.append({'date': date, 'source': journal_link, 'type': type}) + data_row = {f'{bill}': { + 'lower_votes': num_house_votes, + 'upper_votes': num_senate_votes, + 'lower': h_data, + 'upper': s_data + }} + #bill_votes.append(data_row) + yield data_row + #return bill_votes + +def write_data(data, file='results.csv', full=False): + with open(file,'a') as myfile: + writer = csv.writer(myfile, delimiter=',', quotechar='"') + if full: + headers = ['bill_id', 'chamber', 'date', 'journal'] + writer.writerow(headers) + for row in data: + for k, v in row.items(): + bill_id = k + #print(bill_id) + for record in v['lower']: + full_row = [bill_id, 'house', record['date'], + record['source']] + writer.writerow(full_row) + myfile.flush() + for record in v['upper']: + full_row = [bill_id, 'senate', record['date'], + record['source']] + writer.writerow(full_row) + myfile.flush() + else: + headers = ['bill_id', 'lower_votes', 'upper_votes'] + writer.writerow(headers) + for row in data: + for k, v in row.items(): + #print(k) + short_row = [k, v['lower_votes'], v['upper_votes']] + writer.writerow(short_row) + myfile.flush() + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("-c","--chamber", help="Select between house|senate", + choices=['house', 'senate']) + parser.add_argument("-f","--file", help="Output to this file", + type=str, default='results.csv') + parser.add_argument("-m","--max", help="Maximum number of bills to process", + type=int, default=None) + parser.add_argument("-o","--output", help="Output per vote data", + action="store_true", default=False) + parser.add_argument("-s","--session", help="Enter the Session ID, e.g. '85R'", + type=str, default='85R') + args = parser.parse_args() + print('Starting...') + if args.chamber: + bill_urls = get_chamber_bills(args.chamber, args.session) + if args.max > 0: + bill_urls = get_chamber_bills(args.chamber, args.session)[0:args.max] + write_data(scrape_chamber(args.chamber, bill_urls), args.file, + args.output) + else: + if args.max: + last = args.max//2 + hbill_urls = get_chamber_bills('house', args.session)[0:last] + else: + hbill_urls = get_chamber_bills('house', args.session) + write_data(scrape_chamber('house', hbill_urls), args.file, + args.output) + if args.max: + last = args.max//2 + sbill_urls = get_chamber_bills('senate', args.session)[0:last] + else: + sbill_urls = get_chamber_bills('senate', args.session) + write_data(scrape_chamber('senate', sbill_urls), args.file, + args.output) + print('Finished.') + +# ---------------------------------------------- +if __name__ == "__main__": + # execute only if run as a script + main() diff --git a/src/influencetx/tlo/services.py b/src/influencetx/tlo/services.py new file mode 100644 index 0000000..cb9f63b --- /dev/null +++ b/src/influencetx/tlo/services.py @@ -0,0 +1,68 @@ +""" +Application services for TLO. +""" +from collections import namedtuple +from enum import Enum +from django.core.exceptions import ValidationError +from influencetx.legislators import models +from influencetx.tlo import fetch +import logging +LOG = logging.getLogger(__name__) + + +class Action(Enum): + ADDED = 'Added' + FAILED = 'Failed' + UPDATED = 'Updated' + SKIPPED = 'Skipped' + + +class ActionInfo(namedtuple('ActionInfo', ['action', 'instance', 'error'])): + + @classmethod + def update(cls, action, instance): + return cls(action, instance, error=None) + + @classmethod + def fail(cls, error): + return cls(action=Action.FAILED, instance=None, error=error) + + +def sync_legislator_id(json_data, session, chamber, commit=True): + """Add legislator id from TLO. + + Args: + json_data (dict): Legislator data from TLO. + (Example: {'A2100': {'name': 'Allen', 'url'...}}) + commit (bool): Save to the database. + Returns: + info (ActionInfo): Action performed and legislator instance. + """ + LOG.debug(f'Processing data: {json_data}') + id = json_data['id'] + name = json_data['name'] + if ', ' in name: + # Duplicate last name in chamber + name_list = name.split(', ') + last_name = name_list[0] + first_name = name_list[1] + match = models.Legislator.objects.filter(chamber=chamber).filter(last_name=last_name).filter(name__icontains=first_name) + else: + match = models.Legislator.objects.filter(chamber=chamber).filter(last_name=name) + + if match.exists(): + #LOG.info(f'Updating legislator {match[0]}') + return add_legislator_id(match[0], id, commit) + else: + msg = f'Failed to find legislator {name} in chamber {chamber}' + return ActionInfo.fail(msg) + + +def add_legislator_id(instance, id, commit): + if instance.tx_lege_id == id: + return ActionInfo.update(Action.SKIPPED, instance) + else: + instance.tx_lege_id = id + if commit: + instance.save() + return ActionInfo.update(Action.ADDED, instance) diff --git a/src/requirements/base.txt b/src/requirements/base.txt index 2990be8..999265c 100644 --- a/src/requirements/base.txt +++ b/src/requirements/base.txt @@ -52,3 +52,5 @@ requests-cache==0.4.13 # Web scraper requirements bs4==0.0.1 +lxml==4.5.2 +argparse==1.4.0