#126 added tlo legislator id import services
lazarus1331 committed Sep 23, 2020
1 parent 79c0ecf commit d3f3511
Showing 10 changed files with 361 additions and 16 deletions.
4 changes: 3 additions & 1 deletion src/config/settings/base.py
@@ -61,7 +61,8 @@
'influencetx.legislators.apps.LegislatorsConfig',
'influencetx.openstates.apps.OpenstatesConfig',
'influencetx.tpj.apps.TPJConfig',
"influencetx.finances.apps.FinancesConfig",
'influencetx.finances.apps.FinancesConfig',
'influencetx.tlo.apps.TloConfig',
]

# See: https://docs.djangoproject.com/en/dev/ref/settings/#installed-apps
@@ -332,3 +333,4 @@
# ------------------------------------------------------------------------------
GOOGLE_API_KEY = env('GOOGLE_API_KEY', default='')
GOOGLE_ANALYTICS = env('GOOGLE_ANALYTICS', default='')
TLO_SESSION = env('SESSION', default='86')
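
The new TLO_SESSION setting is read through the same env() helper (django-environ style) used by the rest of this settings module, so the target legislative session can be overridden per environment. A minimal sketch, with 87 as a purely illustrative override:

# In the process environment or an .env file (hypothetical override):
#   SESSION=87

from django.conf import settings
print(settings.TLO_SESSION)  # '87' when SESSION is set; otherwise the default '86'
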
5 changes: 5 additions & 0 deletions src/influencetx/tlo/apps.py
@@ -0,0 +1,5 @@
from django.apps import AppConfig


class TloConfig(AppConfig):
name = 'influencetx.tlo'
57 changes: 57 additions & 0 deletions src/influencetx/tlo/fetch.py
@@ -0,0 +1,57 @@
# import json
import os
import logging
import lxml.html
import re
import requests
from django.conf import settings
from time import sleep
LOG = logging.getLogger(__name__)


def lxmlize(url, session=requests.Session()):
"""Parses document into an LXML object and makes links absolute.
Args:
url (str): URL of the document to parse.
Returns:
Element: Document node representing the page.
"""
try:
response = session.get(url, timeout=10)
except requests.exceptions.SSLError:
print('`lxmlize()` failed due to SSL error, trying '
'an unverified `requests.get()`')
response = session.get(url, verify=False, timeout=10)
    except requests.exceptions.ConnectionError:
        print('Request limit exceeded. Waiting 10 seconds.')
        sleep(10)
        response = session.get(url, timeout=10)
page = lxml.html.fromstring(response.text)
page.make_links_absolute(url)
response.close()
return page


def get_legislator_ids(session, chamber):
"""
Return a list of tlo legislator id & names
"""
print(f"Getting {chamber} legislator ids for session {session}")
chamber_map = {
'Senate': 'S',
'House': 'H',
}
url = f"https://capitol.texas.gov/Members/Members.aspx?Chamber={chamber_map[chamber]}"
page = lxmlize(url)
# table id="dataListMembers"
hrefs = page.xpath('//table[@id="dataListMembers"]//@href')
#LOG.warn(hrefs)
id_map = []
for ref in hrefs:
m = re.search(r'(?<=Code=)[A-Z0-9]+$', ref)
id = m.group(0)
#LOG.warn(f'Found ID {id} in {chamber}')
name = page.xpath(f'//table[@id="dataListMembers"]//a[contains(@href, "{id}")]/text()')[0].strip()
#LOG.info(f'Found name {name} for {id} in {chamber}')
data = {'id': f'{id}', 'name': f'{name}', 'url': f'{ref}'}
id_map.append(data)
return {f'{session}': {f'{chamber}': list(id_map)}}
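
A short sketch of calling the fetcher directly and the shape of the data it returns; the id and name shown are the illustrative values from the services.py docstring below, not real scrape output:

from influencetx.tlo import fetch

data = fetch.get_legislator_ids('86', 'House')
# Expected shape, keyed first by session and then by chamber, e.g.:
# {'86': {'House': [{'id': 'A2100', 'name': 'Allen', 'url': 'https://capitol.texas.gov/...'}, ...]}}
for record in data['86']['House']:
    print(record['id'], record['name'])
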
File renamed without changes.
68 changes: 68 additions & 0 deletions src/influencetx/tlo/management/commands/sync_tx_lege_ids.py
@@ -0,0 +1,68 @@
"""
Django admin command wrapper around `sync_legislator_id` in `influencetx.tlo.services`.
"""
from django.core.management.base import BaseCommand
from django.conf import settings
from influencetx.tlo import fetch, services


class Command(BaseCommand):

help = 'Sync legislator ids from TLO'

def add_arguments(self, parser):
        parser.add_argument('--max', default=200, type=int,
                            help='Max number of legislators to sync. Mainly used for testing. '
                                 'Default is 200.')
parser.add_argument('--session', type=int, default=None,
help='Pull data for specified session. Defaults to settings.')
parser.add_argument('--chamber', type=str, default=None,
help='Chamber to sync legislators from. Default is both.')

def handle(self, *args, **options):
total_action = 0
        if not options['session']:
            options['session'] = settings.TLO_SESSION
        # Normalize to a string so lookups match the session keys returned by
        # fetch.get_legislator_ids().
        options['session'] = str(options['session'])
if not options['chamber']:
chamber_list = ['House', 'Senate']
for chamber in chamber_list:
options['chamber'] = chamber
data = self._fetch_legislators(options)
legislator_list = data[options['session']][options['chamber']]
#self.stdout.write(f'{legislator_list}')
if not legislator_list:
self.stdout.write(self.style.SUCCESS('No data to sync'))
return
                for record in legislator_list:
                    if total_action >= options['max']:
                        break
                    #self.stdout.write(f'Processing record: {record}')
                    info = services.sync_legislator_id(record, options['session'], options['chamber'])
                    self._write_info(info)
                    total_action += 1
else:
data = self._fetch_legislators(options)
legislator_list = data[options['session']][options['chamber']]
#self.stdout.write(f'{legislator_list}')
if not legislator_list:
self.stdout.write(self.style.SUCCESS('No data to sync'))
return
            for record in legislator_list:
                if total_action >= options['max']:
                    break
                #self.stdout.write(f'Processing record: {record}')
                info = services.sync_legislator_id(record, options['session'], options['chamber'])
                self._write_info(info)
                total_action += 1

self.stdout.write(self.style.SUCCESS(f'Successfully synced {total_action} legislator ids'))

def _write_info(self, info):
if info.action == services.Action.FAILED:
action = self.style.NOTICE(info.action)
self.stdout.write(f'{action}: {info.error}')
#raise Exception(f"Write failed with {action}: {info.error}")
else:
action = self.style.SUCCESS(info.action)
legislator = info.instance
self.stdout.write(f'{action}: {legislator} ({legislator.tx_lege_id})')

def _fetch_legislators(self, options):
"""Return list of legislator data from TLO."""
return fetch.get_legislator_ids(options['session'], options['chamber'])
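
A sketch of driving the new command programmatically with Django's call_command (it can equally be run through manage.py); this assumes a configured project whose Legislator table has already been populated from Open States:

from django.core.management import call_command

# Sync both chambers using the session configured in settings.TLO_SESSION.
call_command('sync_tx_lege_ids')

# Or restrict the run, e.g. while testing.
call_command('sync_tx_lege_ids', chamber='Senate', max=10)
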
15 changes: 0 additions & 15 deletions src/influencetx/tlo/management/sync_bill_witnesses_from_tlo.py

This file was deleted.

File renamed without changes.
158 changes: 158 additions & 0 deletions src/influencetx/tlo/scrapper/tlo-vote-tally.py
@@ -0,0 +1,158 @@
# from https://github.com/lazarus1331/tlo-vote-tally/blob/master/tlo-vote-tally.py
import argparse
import csv
import lxml.html
import re
import requests
from time import sleep

def lxmlize(url, session=requests.Session()):
"""Parses document into an LXML object and makes links absolute.
Args:
url (str): URL of the document to parse.
Returns:
Element: Document node representing the page.
"""
try:
response = session.get(url, timeout=10)
except requests.exceptions.SSLError:
print('`lxmlize()` failed due to SSL error, trying '
'an unverified `requests.get()`')
response = session.get(url, verify=False, timeout=10)
    except requests.exceptions.ConnectionError:
        print('Request limit exceeded. Waiting 10 seconds.')
        sleep(10)
        response = session.get(url, timeout=10)
page = lxml.html.fromstring(response.text)
page.make_links_absolute(url)
response.close()
return page

def get_chamber_bills(chamber, session='85R'):
"""
Return a list of tlo urls for each bill detected.
"""
print(f"Getting {chamber} bills")
chamber_map = {
'senate': 'senatefiled',
'house': 'housefiled',
}
url = f"https://capitol.texas.gov/Reports/Report.aspx?LegSess={session}&ID={chamber_map[chamber]}"
page = lxmlize(url)
# the only links on the page are to tlo urls
hrefs = page.xpath('//@href')
links = []
for url in hrefs:
if re.match(r'.*=[SH]B\d+$', url):
links.append(url)
print(f'Found {len(links)} {chamber} bills')
return links

def scrape_chamber(chamber, bill_list):
"""
    Yield a dictionary object for each bill with its vote-record counts.
    Each entry also includes links to the journals used as the original source and the date of each vote.
"""
#bill_list = bill_list[-300:0]
bill_votes = []
print(f'Scraping bill urls for chamber {chamber}')
s = requests.Session()
for url in bill_list:
#print(url)
bill = re.search(r'[SH]B\d+$', url).group()
page = lxmlize(url, s)
house_vote_records = page.xpath('//table/tr[@id="houvote"]')
senate_vote_records = page.xpath('//table/tr[@id="senvote"]')
num_house_votes = len(house_vote_records)
num_senate_votes = len(senate_vote_records)
h_data, s_data = [], []
for record in house_vote_records:
journal_link = record.xpath('./td[2]/a/@href')[0]
type = record.xpath('./td[2]/a/text()')[0]
date = record.xpath('./td[4]/text()')[0].strip()
h_data.append({'date': date, 'source': journal_link, 'type': type})
for record in senate_vote_records:
journal_link = record.xpath('./td[2]/a/@href')[0]
type = record.xpath('./td[2]/a/text()')[0]
date = record.xpath('./td[4]/text()')[0].strip()
s_data.append({'date': date, 'source': journal_link, 'type': type})
data_row = {f'{bill}': {
'lower_votes': num_house_votes,
'upper_votes': num_senate_votes,
'lower': h_data,
'upper': s_data
}}
#bill_votes.append(data_row)
yield data_row
#return bill_votes

def write_data(data, file='results.csv', full=False):
with open(file,'a') as myfile:
writer = csv.writer(myfile, delimiter=',', quotechar='"')
if full:
headers = ['bill_id', 'chamber', 'date', 'journal']
writer.writerow(headers)
for row in data:
for k, v in row.items():
bill_id = k
#print(bill_id)
for record in v['lower']:
full_row = [bill_id, 'house', record['date'],
record['source']]
writer.writerow(full_row)
myfile.flush()
for record in v['upper']:
full_row = [bill_id, 'senate', record['date'],
record['source']]
writer.writerow(full_row)
myfile.flush()
else:
headers = ['bill_id', 'lower_votes', 'upper_votes']
writer.writerow(headers)
for row in data:
for k, v in row.items():
#print(k)
short_row = [k, v['lower_votes'], v['upper_votes']]
writer.writerow(short_row)
myfile.flush()

def main():
parser = argparse.ArgumentParser()
parser.add_argument("-c","--chamber", help="Select between house|senate",
choices=['house', 'senate'])
parser.add_argument("-f","--file", help="Output to this file",
type=str, default='results.csv')
parser.add_argument("-m","--max", help="Maximum number of bills to process",
type=int, default=None)
parser.add_argument("-o","--output", help="Output per vote data",
action="store_true", default=False)
parser.add_argument("-s","--session", help="Enter the Session ID, e.g. '85R'",
type=str, default='85R')
args = parser.parse_args()
print('Starting...')
if args.chamber:
        bill_urls = get_chamber_bills(args.chamber, args.session)
        if args.max:
            bill_urls = bill_urls[0:args.max]
write_data(scrape_chamber(args.chamber, bill_urls), args.file,
args.output)
else:
if args.max:
last = args.max//2
hbill_urls = get_chamber_bills('house', args.session)[0:last]
else:
hbill_urls = get_chamber_bills('house', args.session)
write_data(scrape_chamber('house', hbill_urls), args.file,
args.output)
if args.max:
last = args.max//2
sbill_urls = get_chamber_bills('senate', args.session)[0:last]
else:
sbill_urls = get_chamber_bills('senate', args.session)
write_data(scrape_chamber('senate', sbill_urls), args.file,
args.output)
print('Finished.')

# ----------------------------------------------
if __name__ == "__main__":
# execute only if run as a script
main()
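
The script is written to run from the command line via main(), but the helpers compose directly; a minimal sketch assuming they are imported into a Python session (the output file name and bill limit are illustrative):

# Count recorded votes for the first 25 House bills of the 85th regular session
# and write one summary row per bill to house_votes.csv.
bill_urls = get_chamber_bills('house', session='85R')[:25]
write_data(scrape_chamber('house', bill_urls), file='house_votes.csv')

# write_data(..., full=True) would instead emit one row per vote record,
# including the journal link and vote date.
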
68 changes: 68 additions & 0 deletions src/influencetx/tlo/services.py
@@ -0,0 +1,68 @@
"""
Application services for TLO.
"""
from collections import namedtuple
from enum import Enum
from django.core.exceptions import ValidationError
from influencetx.legislators import models
from influencetx.tlo import fetch
import logging
LOG = logging.getLogger(__name__)


class Action(Enum):
ADDED = 'Added'
FAILED = 'Failed'
UPDATED = 'Updated'
SKIPPED = 'Skipped'


class ActionInfo(namedtuple('ActionInfo', ['action', 'instance', 'error'])):

@classmethod
def update(cls, action, instance):
return cls(action, instance, error=None)

@classmethod
def fail(cls, error):
return cls(action=Action.FAILED, instance=None, error=error)


def sync_legislator_id(json_data, session, chamber, commit=True):
"""Add legislator id from TLO.
    Args:
        json_data (dict): Legislator data from TLO.
            (Example: {'id': 'A2100', 'name': 'Allen', 'url': ...})
        session (str): Legislative session the data was fetched for.
        chamber (str): Chamber used to match existing legislators.
        commit (bool): Save changes to the database.
Returns:
info (ActionInfo): Action performed and legislator instance.
"""
LOG.debug(f'Processing data: {json_data}')
id = json_data['id']
name = json_data['name']
if ', ' in name:
# Duplicate last name in chamber
name_list = name.split(', ')
last_name = name_list[0]
first_name = name_list[1]
match = models.Legislator.objects.filter(chamber=chamber).filter(last_name=last_name).filter(name__icontains=first_name)
else:
match = models.Legislator.objects.filter(chamber=chamber).filter(last_name=name)

if match.exists():
#LOG.info(f'Updating legislator {match[0]}')
return add_legislator_id(match[0], id, commit)
else:
msg = f'Failed to find legislator {name} in chamber {chamber}'
return ActionInfo.fail(msg)


def add_legislator_id(instance, id, commit):
if instance.tx_lege_id == id:
return ActionInfo.update(Action.SKIPPED, instance)
else:
instance.tx_lege_id = id
if commit:
instance.save()
return ActionInfo.update(Action.ADDED, instance)
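
A sketch of how the service is used by the management command above; the record dict mirrors what fetch.get_legislator_ids() yields (values are illustrative) and a populated Legislator table is assumed:

from influencetx.tlo import services

record = {'id': 'A2100', 'name': 'Allen', 'url': 'https://capitol.texas.gov/...'}
info = services.sync_legislator_id(record, session='86', chamber='House')
if info.action == services.Action.FAILED:
    print(info.error)
else:
    print(info.action, info.instance.tx_lege_id)  # e.g. Action.ADDED plus the stored TLO id
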
2 changes: 2 additions & 0 deletions src/requirements/base.txt
@@ -52,3 +52,5 @@ requests-cache==0.4.13

# Web scraper requirements
bs4==0.0.1
lxml==4.5.2
argparse==1.4.0
