-
Notifications
You must be signed in to change notification settings - Fork 16
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
#126 added tlo legislator id import services
- Loading branch information
1 parent
79c0ecf
commit d3f3511
Showing
10 changed files
with
361 additions
and
16 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
from django.apps import AppConfig | ||
|
||
|
||
class TloConfig(AppConfig):
    """Django application configuration for the ``influencetx.tlo`` app."""

    # Dotted Python path of the application package this config belongs to.
    name = 'influencetx.tlo'
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
# import json | ||
import os | ||
import logging | ||
import lxml.html | ||
import re | ||
import requests | ||
from django.conf import settings | ||
from time import sleep | ||
LOG = logging.getLogger(__name__) | ||
|
||
|
||
def lxmlize(url, session=None):
    """Fetch ``url`` and parse it into an LXML document with absolute links.

    Args:
        url (str): URL of the document to parse.
        session (requests.Session, optional): Session used for the request.
            When omitted, a temporary session is created for this call and
            closed again before returning, instead of sharing one session
            object created at import time.

    Returns:
        Element: Document node representing the page.

    Raises:
        requests.exceptions.RequestException: If the single retry fails too.
    """
    owns_session = session is None
    if owns_session:
        session = requests.Session()
    try:
        try:
            response = session.get(url, timeout=10)
        except requests.exceptions.SSLError:
            # SSLError must be handled before ConnectionError: in requests it
            # is a subclass of ConnectionError.
            LOG.warning('`lxmlize()` failed due to SSL error, trying '
                        'an unverified `requests.get()`')
            response = session.get(url, verify=False, timeout=10)
        except requests.exceptions.ConnectionError:
            LOG.warning('Request limit exceeded. Waiting 10 seconds.')
            # Actually wait before retrying; the original only announced it.
            sleep(10)
            response = session.get(url, timeout=10)
        try:
            page = lxml.html.fromstring(response.text)
            page.make_links_absolute(url)
        finally:
            # Release the connection even when parsing fails.
            response.close()
        return page
    finally:
        if owns_session:
            session.close()
|
||
|
||
def get_legislator_ids(session, chamber):
    """Scrape TLO legislator ids and names for one chamber.

    Args:
        session: Legislative session identifier. It is only used in the
            progress message and, converted to ``str``, as the top-level key
            of the returned mapping.
        chamber (str): Either ``'Senate'`` or ``'House'``.

    Returns:
        dict: ``{str(session): {chamber: [record, ...]}}`` where each record
        is ``{'id': ..., 'name': ..., 'url': ...}``.

    Raises:
        KeyError: If ``chamber`` is not ``'Senate'`` or ``'House'``.
    """
    print(f"Getting {chamber} legislator ids for session {session}")
    chamber_map = {
        'Senate': 'S',
        'House': 'H',
    }
    url = f"https://capitol.texas.gov/Members/Members.aspx?Chamber={chamber_map[chamber]}"
    page = lxmlize(url)
    # Every href inside the table with id="dataListMembers".
    hrefs = page.xpath('//table[@id="dataListMembers"]//@href')
    members = []
    for ref in hrefs:
        found = re.search(r'(?<=Code=)[A-Z0-9]+$', ref)
        if found is None:
            # Not a member link (no trailing "Code=<ID>"); skip, don't crash.
            LOG.warning('Skipping link without a legislator code: %s', ref)
            continue
        tx_lege_id = found.group(0)
        names = page.xpath(
            f'//table[@id="dataListMembers"]//a[contains(@href, "{tx_lege_id}")]/text()'
        )
        if not names:
            # The anchor has no direct text node to use as a name.
            LOG.warning('No name found for legislator id %s', tx_lege_id)
            continue
        members.append({
            'id': str(tx_lege_id),
            'name': str(names[0].strip()),
            'url': str(ref),
        })
    return {str(session): {str(chamber): members}}
File renamed without changes.
68 changes: 68 additions & 0 deletions
68
src/influencetx/tlo/management/commands/sync_tx_lege_ids.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
""" | ||
Django admin command wrapper around `sync_legislator_id` in `influencetx.tlo.services`. | ||
""" | ||
from django.core.management.base import BaseCommand | ||
from django.conf import settings | ||
from influencetx.tlo import fetch, services | ||
|
||
|
||
class Command(BaseCommand):
    """Management command that syncs TLO legislator ids via ``services``."""

    help = 'Sync legislator ids from TLO'

    def add_arguments(self, parser):
        """Register the ``--max``, ``--session`` and ``--chamber`` options."""
        parser.add_argument('--max', default=200, type=int,
                            help='Max number of legislators to sync. Mainly used for testing. '
                                 'Default is 200.')
        parser.add_argument('--session', type=int, default=None,
                            help='Pull data for specified session. Defaults to settings.')
        parser.add_argument('--chamber', type=str, default=None,
                            choices=['House', 'Senate'],
                            help='Chamber to sync legislators from. Default is both.')

    def handle(self, *args, **options):
        """Fetch legislator ids for the requested chamber(s) and sync them.

        At most ``--max`` records are processed across all chambers. A
        chamber with no data is reported and skipped, so the remaining
        chamber is still processed. Only records that did not fail count
        towards the final total.
        """
        session = options['session'] or settings.TLO_SESSION
        chambers = [options['chamber']] if options['chamber'] else ['House', 'Senate']
        remaining = options.get('max')  # None means "no limit"
        total_synced = 0

        for chamber in chambers:
            if remaining is not None and remaining <= 0:
                break
            data = self._fetch_legislators(dict(options, session=session, chamber=chamber))
            # The fetch function keys its result by the *string* form of the
            # session, while ``--session`` is parsed as an int.
            legislator_list = data[str(session)][chamber]
            if not legislator_list:
                self.stdout.write(self.style.SUCCESS('No data to sync'))
                continue
            if remaining is not None:
                legislator_list = legislator_list[:remaining]
                remaining -= len(legislator_list)
            for record in legislator_list:
                info = services.sync_legislator_id(record, session, chamber)
                self._write_info(info)
                if info.action != services.Action.FAILED:
                    total_synced += 1

        self.stdout.write(self.style.SUCCESS(f'Successfully synced {total_synced} legislator ids'))

    def _write_info(self, info):
        """Write one line describing the outcome of a single sync action."""
        if info.action == services.Action.FAILED:
            action = self.style.NOTICE(info.action)
            self.stdout.write(f'{action}: {info.error}')
        else:
            action = self.style.SUCCESS(info.action)
            legislator = info.instance
            self.stdout.write(f'{action}: {legislator} ({legislator.tx_lege_id})')

    def _fetch_legislators(self, options):
        """Return legislator data from TLO for ``options['session']``/``['chamber']``."""
        return fetch.get_legislator_ids(options['session'], options['chamber'])
15 changes: 0 additions & 15 deletions
15
src/influencetx/tlo/management/sync_bill_witnesses_from_tlo.py
This file was deleted.
Oops, something went wrong.
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,158 @@ | ||
# from https://github.com/lazarus1331/tlo-vote-tally/blob/master/tlo-vote-tally.py | ||
import argparse | ||
import csv | ||
import lxml.html | ||
import re | ||
import requests | ||
from time import sleep | ||
|
||
def lxmlize(url, session=None):
    """Fetch ``url`` and parse it into an LXML document with absolute links.

    Args:
        url (str): URL of the document to parse.
        session (requests.Session, optional): Session used for the request.
            When omitted, a temporary session is created for this call and
            closed again before returning.

    Returns:
        Element: Document node representing the page.

    Raises:
        requests.exceptions.RequestException: If the single retry fails too.
    """
    owns_session = session is None
    if owns_session:
        session = requests.Session()
    try:
        try:
            response = session.get(url, timeout=10)
        except requests.exceptions.SSLError:
            # Must precede ConnectionError: SSLError is a subclass of it.
            print('`lxmlize()` failed due to SSL error, trying '
                  'an unverified `requests.get()`')
            response = session.get(url, verify=False, timeout=10)
        except requests.exceptions.ConnectionError:
            print('Request limit exceeded. Waiting 10 seconds.')
            # Actually wait before retrying; the original only announced it.
            sleep(10)
            response = session.get(url, timeout=10)
        try:
            page = lxml.html.fromstring(response.text)
            page.make_links_absolute(url)
        finally:
            # Release the connection even when parsing fails.
            response.close()
        return page
    finally:
        if owns_session:
            session.close()
|
||
def get_chamber_bills(chamber, session='85R'):
    """Collect the TLO bill URLs listed on a chamber's "filed" report.

    Args:
        chamber (str): ``'senate'`` or ``'house'``.
        session (str): Session id placed in the report URL.

    Returns:
        list: Links from the report page that end in ``=SB<n>`` or ``=HB<n>``.
    """
    print(f"Getting {chamber} bills")
    chamber_map = {'senate': 'senatefiled', 'house': 'housefiled'}
    url = f"https://capitol.texas.gov/Reports/Report.aspx?LegSess={session}&ID={chamber_map[chamber]}"
    document = lxmlize(url)
    bill_link = re.compile(r'.*=[SH]B\d+$')
    # Keep only the hrefs that point at a bill.
    links = [href for href in document.xpath('//@href') if bill_link.match(href)]
    print(f'Found {len(links)} {chamber} bills')
    return links
|
||
def _parse_vote_records(rows):
    """Turn vote table rows into ``{'date', 'source', 'type'}`` dicts."""
    parsed = []
    for row in rows:
        journal_link = row.xpath('./td[2]/a/@href')[0]
        vote_type = row.xpath('./td[2]/a/text()')[0]
        date = row.xpath('./td[4]/text()')[0].strip()
        parsed.append({'date': date, 'source': journal_link, 'type': vote_type})
    return parsed


def scrape_chamber(chamber, bill_list):
    """Yield one vote-record summary per bill URL.

    This is a generator: pages are fetched lazily as the caller iterates.

    Args:
        chamber (str): Chamber name; only used in the progress message.
        bill_list (iterable): Bill URLs ending in ``SB<n>`` or ``HB<n>``.

    Yields:
        dict: ``{bill_id: {'lower_votes': int, 'upper_votes': int,
        'lower': [...], 'upper': [...]}}`` where the lists hold the date,
        journal link (``'source'``) and link text (``'type'``) of each vote.
    """
    print(f'Scraping bill urls for chamber {chamber}')
    # One session for all bill pages, closed once the generator finishes.
    with requests.Session() as http:
        for url in bill_list:
            found = re.search(r'[SH]B\d+$', url)
            if found is None:
                # Not a bill URL; skip it instead of crashing on ``.group()``.
                print(f'Skipping url without a bill id: {url}')
                continue
            bill = found.group()
            page = lxmlize(url, http)
            h_data = _parse_vote_records(page.xpath('//table/tr[@id="houvote"]'))
            s_data = _parse_vote_records(page.xpath('//table/tr[@id="senvote"]'))
            yield {f'{bill}': {
                'lower_votes': len(h_data),
                'upper_votes': len(s_data),
                'lower': h_data,
                'upper': s_data,
            }}
|
||
def write_data(data, file='results.csv', full=False):
    """Append scraped vote data to a CSV file.

    The header row is written only when the target file is empty, so calling
    this function several times on the same file yields a single header.

    Args:
        data (iterable): Dicts shaped like ``{bill_id: {'lower_votes': ...,
            'upper_votes': ..., 'lower': [...], 'upper': [...]}}``.
        file (str): Path of the CSV file to append to.
        full (bool): When true, write one row per vote record (bill id,
            chamber, date, journal link); otherwise one row per bill with
            its vote counts.
    """
    if full:
        headers = ['bill_id', 'chamber', 'date', 'journal']
    else:
        headers = ['bill_id', 'lower_votes', 'upper_votes']

    # newline='' lets the csv module control line endings itself.
    with open(file, 'a', newline='') as myfile:
        writer = csv.writer(myfile, delimiter=',', quotechar='"')
        # In append mode the stream starts at the end of the file, so a
        # position of 0 means the file is new or empty.
        if myfile.tell() == 0:
            writer.writerow(headers)
        for row in data:
            for bill_id, votes in row.items():
                if full:
                    for chamber_name, key in (('house', 'lower'), ('senate', 'upper')):
                        for record in votes[key]:
                            writer.writerow([bill_id, chamber_name,
                                             record['date'], record['source']])
                else:
                    writer.writerow([bill_id, votes['lower_votes'],
                                     votes['upper_votes']])
            # ``data`` may be a slow generator; flush after each item so
            # partial results reach the file while it is still running.
            myfile.flush()
|
||
def main():
    """Parse command-line options, scrape the chosen chamber(s), write CSV.

    Without ``--chamber`` both chambers are scraped, house first, and a
    ``--max`` limit is split evenly between them. A missing, zero or
    negative ``--max`` means no limit.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-c", "--chamber", help="Select between house|senate",
                        choices=['house', 'senate'])
    parser.add_argument("-f", "--file", help="Output to this file",
                        type=str, default='results.csv')
    parser.add_argument("-m", "--max", help="Maximum number of bills to process",
                        type=int, default=None)
    parser.add_argument("-o", "--output", help="Output per vote data",
                        action="store_true", default=False)
    parser.add_argument("-s", "--session", help="Enter the Session ID, e.g. '85R'",
                        type=str, default='85R')
    args = parser.parse_args()
    print('Starting...')

    # ``--max`` defaults to None; comparing None with ``> 0`` raises
    # TypeError, so normalise it once up front.
    limit = args.max if args.max is not None and args.max > 0 else None

    if args.chamber:
        bill_urls = get_chamber_bills(args.chamber, args.session)
        if limit is not None:
            bill_urls = bill_urls[:limit]
        write_data(scrape_chamber(args.chamber, bill_urls), args.file,
                   args.output)
    else:
        per_chamber = limit // 2 if limit is not None else None
        for chamber in ('house', 'senate'):
            bill_urls = get_chamber_bills(chamber, args.session)
            if per_chamber is not None:
                bill_urls = bill_urls[:per_chamber]
            write_data(scrape_chamber(chamber, bill_urls), args.file,
                       args.output)
    print('Finished.')


# ----------------------------------------------
if __name__ == "__main__":
    # execute only if run as a script
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
""" | ||
Application services for TLO. | ||
""" | ||
from collections import namedtuple | ||
from enum import Enum | ||
from django.core.exceptions import ValidationError | ||
from influencetx.legislators import models | ||
from influencetx.tlo import fetch | ||
import logging | ||
LOG = logging.getLogger(__name__) | ||
|
||
|
||
class Action(Enum):
    """Outcome of a single sync operation."""

    ADDED = 'Added'
    FAILED = 'Failed'
    UPDATED = 'Updated'
    SKIPPED = 'Skipped'
|
||
|
||
class ActionInfo(namedtuple('ActionInfo', ['action', 'instance', 'error'])):
    """Result of a sync action: the action, the affected instance, any error."""

    @classmethod
    def update(cls, action, instance):
        """Build a result for ``action`` on ``instance`` with no error."""
        return cls(action=action, instance=instance, error=None)

    @classmethod
    def fail(cls, error):
        """Build a failed result that carries only ``error``."""
        return cls(Action.FAILED, None, error)
|
||
|
||
def sync_legislator_id(json_data, session, chamber, commit=True):
    """Attach a TLO legislator id to the matching ``Legislator`` record.

    Args:
        json_data (dict): One legislator record from TLO; the ``'id'`` and
            ``'name'`` keys are read.
            (Example: {'id': 'A2100', 'name': 'Allen', 'url': ...})
        session: Legislative session; accepted for the caller's convenience
            but not used by the lookup.
        chamber (str): Chamber the legislator is looked up in.
        commit (bool): Save to the database.

    Returns:
        info (ActionInfo): Action performed and legislator instance, or a
        failure when no legislator matches.
    """
    LOG.debug('Processing data: %s', json_data)
    tx_lege_id = json_data['id']
    name = json_data['name']
    candidates = models.Legislator.objects.filter(chamber=chamber)
    if ', ' in name:
        # "Last, First" form: used when a last name is duplicated in a chamber.
        name_list = name.split(', ')
        last_name = name_list[0]
        first_name = name_list[1]
        candidates = candidates.filter(last_name=last_name).filter(name__icontains=first_name)
    else:
        candidates = candidates.filter(last_name=name)

    # ``first()`` needs a single query and returns None when nothing matches.
    legislator = candidates.first()
    if legislator is None:
        msg = f'Failed to find legislator {name} in chamber {chamber}'
        return ActionInfo.fail(msg)
    return add_legislator_id(legislator, tx_lege_id, commit)
|
||
|
||
def add_legislator_id(instance, id, commit):
    """Set ``instance.tx_lege_id`` to ``id`` unless it already has that value.

    Args:
        instance: Object with a ``tx_lege_id`` attribute and a ``save()`` method.
        id: Legislator id to store.
        commit (bool): Call ``instance.save()`` after assigning the id.

    Returns:
        ActionInfo: ``SKIPPED`` when the id is already set to ``id``,
        otherwise ``ADDED``.
    """
    # Guard clause: nothing to change.
    if instance.tx_lege_id == id:
        return ActionInfo.update(Action.SKIPPED, instance)

    instance.tx_lege_id = id
    if commit:
        instance.save()
    return ActionInfo.update(Action.ADDED, instance)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -52,3 +52,5 @@ requests-cache==0.4.13 | |
|
||
# Web scraper requirements | ||
bs4==0.0.1 | ||
lxml==4.5.2 | ||
argparse==1.4.0 |