Skip to content

Commit

Permalink
Setup github actions (#146)
Browse files Browse the repository at this point in the history
* Setup github actions

* Update requirements.txt

* Update python-app.yml

* Fix github action syntax

* Skip slow test for now

* Update .github/workflows/python-app.yml

Co-authored-by: Dan Minshew <ofenixculpa@gmail.com>

* Update requirements.txt

Co-authored-by: Dan Minshew <ofenixculpa@gmail.com>

* Revert "format the county data (#145)"

This reverts commit 63202c8.

* Fix tests

* Fix missing file error

* Add sphinx

* Build docs site as a github page

* try fixing build

* Set sphinx version more explicitly

* Debugging build

* Abandon sphinx-action step

---------

Co-authored-by: Dan Minshew <ofenixculpa@gmail.com>
  • Loading branch information
Matt343 and newswim authored Sep 15, 2024
1 parent 66f1f46 commit 45e3bf6
Show file tree
Hide file tree
Showing 11 changed files with 392 additions and 278 deletions.
45 changes: 45 additions & 0 deletions .github/workflows/python-app.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# This workflow will install Python dependencies, run tests and lint with a single version of Python
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python

name: Indigent Defense Stats

on:
  push:
    branches: [ "main" ]
  pull_request:
    branches: [ "main" ]

# Least-privilege token: this workflow only needs to read repository contents.
permissions:
  contents: read

jobs:
  build:

    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v4
      - name: Set up Python 3.12
        uses: actions/setup-python@v3
        with:
          python-version: "3.12"
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
      - name: Lint with flake8
        run: |
          # stop the build if there are Python syntax errors or undefined names
          flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
          # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
          flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
      - name: Test with unittest
        run: |
          # SKIP_SLOW is read by src/tester/test_unittest.py to skip the slow scrape test
          SKIP_SLOW=true python -m unittest discover -v -s ./src/tester
      - name: Build documentation
        run: |
          sphinx-build -b html docs build
      # Publish the built HTML so it can be deployed as a GitHub Pages site
      - uses: actions/upload-pages-artifact@v3.0.1
        with:
          path: build/
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -167,3 +167,5 @@ data/
.~lock.*

.DS_Store

docs/generated
31 changes: 31 additions & 0 deletions docs/conf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Configuration file for the Sphinx documentation builder.
#
# For the full list of built-in configuration values, see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html
import os
import sys

# -- Project information -----------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information

project = 'indigent-defense-stats'
# Sphinx expects a module-level name "copyright"; shadowing the builtin is
# the conventional conf.py idiom.
copyright = '2024, Open Austin'
author = 'Open Austin'

# -- General configuration ---------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration

# Make the project package importable for autodoc. Anchor the path on this
# file's location rather than the current working directory so the build does
# not depend on where sphinx-build is invoked from.
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'src')))

extensions = ['sphinx.ext.autodoc', 'sphinx.ext.autosummary']

templates_path = ['_templates']
# NOTE(review): exclude_patterns is matched relative to the source (docs)
# directory, so 'src/tester' likely matches nothing here -- confirm intent.
exclude_patterns = ['src/tester']


# -- Options for HTML output -------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output

html_theme = 'alabaster'
html_static_path = ['_static']
21 changes: 21 additions & 0 deletions docs/index.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
.. indigent-defense-stats documentation master file, created by
sphinx-quickstart on Sun Sep 15 15:44:02 2024.
You can adapt this file completely to your liking, but it should at least
contain the root `toctree` directive.
indigent-defense-stats documentation
====================================

Add your content using ``reStructuredText`` syntax. See the
`reStructuredText <https://www.sphinx-doc.org/en/master/usage/restructuredtext/index.html>`_
documentation for details.

.. autosummary::
:toctree: generated

cleaner
orchestrator
parser
scraper
tools
updater
4 changes: 3 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,6 @@ python-dotenv == 1.0.1
requests == 2.32.3
retry == 0.9.2
statistics == 1.0.3.5
xxhash == 3.5.0
xxhash == 3.5.0
flake8 == 7.1.0
Sphinx == 8.0.2
510 changes: 255 additions & 255 deletions resources/texas_county_data.csv

Large diffs are not rendered by default.

6 changes: 6 additions & 0 deletions src/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
from . import cleaner
from . import orchestrator
from . import parser
from . import scraper
from . import tools
from . import updater
20 changes: 8 additions & 12 deletions src/orchestrator/__init__.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,18 @@
import sys, os, csv

# Appends the parent directory of this handler script to the sys.path
current_dir = os.path.dirname(os.path.abspath(__file__))
parent_dir = os.path.dirname(current_dir)
sys.path.append(parent_dir)

# Import all of the programs modules within the parent_dir
from scraper import scraper
from parser import parser
from cleaner import cleaner
from updater import updater
import scraper
import parser
import cleaner
import updater

class orchestrator:
class Orchestrator:
def __init__(self):
#Sets our base parameters
self.counties = []
self.start_date = '2024-07-01' #Update start date here
self.end_date = '2024-07-01' #Update end date here
def orchestrate(self, test):
def orchestrate(self, test: bool = False):

#This opens the county data CSV to see which counties should be scraped, parsed, cleaned, and updated.
with open(
Expand All @@ -41,4 +36,5 @@ def orchestrate(self, test):
updater(c).update() #src/updater
print(f"Completed with scraping, parsing, cleaning, and updating of this county: {c}")

orchestrator().orchestrate()
if __name__ == '__main__':
Orchestrator().orchestrate()
27 changes: 17 additions & 10 deletions src/parser/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ def get_directories(self, county, test):

def get_list_of_html(self, case_html_path, case_number, county, test):
# This will loop through the html in the folder they were scraped to.
os.makedirs(case_html_path, exist_ok=True)
case_html_list = os.listdir(case_html_path)

# However, if an optional case number is passed to the function, then read in the case number html file from the data folder
Expand Down Expand Up @@ -104,15 +105,16 @@ def write_json_data(self, case_json_path, case_number, case_data, test):
file_handle.write(json.dumps(case_data))

def write_error_log(self, county, case_number):
    """Record a case number whose HTML could not be parsed.

    Appends *case_number* to data/<county>/cases_with_parsing_error.txt
    (relative to the repository root), creating the directory first if it
    does not exist yet.
    """
    basepath = os.path.join(
        os.path.dirname(__file__),
        "..",
        "..",
        "data",
        county,
    )
    # The county data directory may be missing on a fresh checkout.
    os.makedirs(basepath, exist_ok=True)
    # Append rather than overwrite ("w") so earlier failures are not lost --
    # the file is meant to accumulate every failing case number.
    with open(
        os.path.join(basepath, "cases_with_parsing_error.txt"),
        "a",
    ) as file_handle:
        file_handle.write(case_number + "\n")
Expand Down Expand Up @@ -148,8 +150,13 @@ def parse(self, county, case_number, test): #remove the test value here and just
case_html_file_path = self.get_html_path(case_html_path, case_html_file_name, case_number, test)

print(f"{case_number} - parsing")
with open(case_html_file_path, "r") as file_handle:
case_soup = BeautifulSoup(file_handle, "html.parser", from_encoding="UTF-8")
# strip out invalid utf-8 characters
with open(case_html_file_path, "r", encoding='utf-8', errors='ignore') as file_handle:
try:
case_soup = BeautifulSoup(file_handle, "html.parser", from_encoding="UTF-8")
except Exception as e:
print(f'error building beautiful soup for file {case_html_file_path}, {e}')
raise e

# Get the county-specific parser class and method
parser_instance, parser_function = self.get_class_and_method(county=county)
Expand Down
1 change: 1 addition & 0 deletions src/scraper/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import importlib

class Scraper:
"""Scrape Odyssey html files into an output folder"""
def __init__(self):
pass

Expand Down
3 changes: 3 additions & 0 deletions src/tester/test_unittest.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
from cleaner import Cleaner
from updater import Updater

SKIP_SLOW = os.getenv('SKIP_SLOW', 'false').lower().strip() == 'true'

def log(message, level='INFO'): # Provide message and info level (optional, defaulting to info)
# configure the logger
log = logging.getLogger(name="pid: " + str(os.getpid()))
Expand Down Expand Up @@ -252,6 +254,7 @@ def test_scrape_results_page(self,
#def scrape_case_data_pre2017()
#def scrape_case_data_post2017()

@unittest.skipIf(SKIP_SLOW, "slow")
def test_scrape_multiple_cases(self,
county = 'hays',
odyssey_version = 2003,
Expand Down

0 comments on commit 45e3bf6

Please sign in to comment.