Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Setup github actions #146

Merged
merged 18 commits into from
Sep 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 45 additions & 0 deletions .github/workflows/python-app.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# This workflow will install Python dependencies, run tests and lint with a single version of Python
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python

name: Indigent Defense Stats

on:
  push:
    branches: [ "main" ]
  pull_request:
    branches: [ "main" ]

permissions:
  contents: read

jobs:
  build:

    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v4
      - name: Set up Python 3.12
        # v5 is the current major release (Node 20 runtime); v3 runs on the
        # deprecated Node 16 runtime and was inconsistent with checkout@v4.
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
      - name: Lint with flake8
        run: |
          # stop the build if there are Python syntax errors or undefined names
          flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
          # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
          flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
      - name: Test with unittest
        run: |
          # SKIP_SLOW=true makes the test suite skip the long-running scraper tests
          SKIP_SLOW=true python -m unittest discover -v -s ./src/tester
      - name: Build documentation
        run: |
          sphinx-build -b html docs build

      - uses: actions/upload-pages-artifact@v3.0.1
        with:
          path: build/
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -167,3 +167,5 @@ data/
.~lock.*

.DS_Store

docs/generated
31 changes: 31 additions & 0 deletions docs/conf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Configuration file for the Sphinx documentation builder.
#
# For the full list of built-in configuration values, see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html

import os
import sys

# Make the project sources importable so autodoc/autosummary can resolve
# the modules listed in docs/index.rst.
sys.path.insert(0, os.path.abspath('../src'))

# -- Project information -----------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information

project = 'indigent-defense-stats'
copyright = '2024, Open Austin'
author = 'Open Austin'

# -- General configuration ---------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration

extensions = [
    'sphinx.ext.autodoc',
    'sphinx.ext.autosummary',
]

templates_path = ['_templates']
# NOTE(review): exclude_patterns is interpreted relative to this docs/ source
# directory, so 'src/tester' likely matches nothing here — confirm intent.
exclude_patterns = ['src/tester']

# -- Options for HTML output -------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output

html_theme = 'alabaster'
html_static_path = ['_static']
21 changes: 21 additions & 0 deletions docs/index.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
.. indigent-defense-stats documentation master file, created by
   sphinx-quickstart on Sun Sep 15 15:44:02 2024.
   You can adapt this file completely to your liking, but it should at least
   contain the root `toctree` directive.

indigent-defense-stats documentation
====================================

Add your content using ``reStructuredText`` syntax. See the
`reStructuredText <https://www.sphinx-doc.org/en/master/usage/restructuredtext/index.html>`_
documentation for details.

.. autosummary::
:toctree: generated

cleaner
orchestrator
parser
scraper
tools
updater
4 changes: 3 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,6 @@ python-dotenv == 1.0.1
requests == 2.32.3
retry == 0.9.2
statistics == 1.0.3.5
xxhash == 3.5.0
xxhash == 3.5.0
flake8 == 7.1.0
Sphinx == 8.0.2
510 changes: 255 additions & 255 deletions resources/texas_county_data.csv

Large diffs are not rendered by default.

6 changes: 6 additions & 0 deletions src/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
from . import cleaner
from . import orchestrator
from . import parser
from . import scraper
from . import tools
from . import updater
20 changes: 8 additions & 12 deletions src/orchestrator/__init__.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,18 @@
import sys, os, csv

# Appends the parent directory of this handler script to the sys.path
current_dir = os.path.dirname(os.path.abspath(__file__))
parent_dir = os.path.dirname(current_dir)
sys.path.append(parent_dir)

# Import all of the programs modules within the parent_dir
from scraper import scraper
from parser import parser
from cleaner import cleaner
from updater import updater
import scraper
import parser
import cleaner
import updater

class orchestrator:
class Orchestrator:
def __init__(self):
    """Set the base parameters for a pipeline run."""
    # Counties to process; presumably populated from the county CSV during
    # orchestrate() — TODO confirm.
    self.counties = []
    self.start_date = '2024-07-01' #Update start date here
    self.end_date = '2024-07-01' #Update end date here
def orchestrate(self, test):
def orchestrate(self, test: bool = False):

#This open the county data CSV to see which counties should be scraped, parsed, cleaned, and updated.
with open(
Expand All @@ -41,4 +36,5 @@ def orchestrate(self, test):
updater(c).update() #src/updater
print(f"Completed with scraping, parsing, cleaning, and updating of this county: {c}")

orchestrator().orchestrate()
# Run the full scrape/parse/clean/update pipeline only when executed as a
# script, not when this package is imported.
if __name__ == '__main__':
    Orchestrator().orchestrate()
27 changes: 17 additions & 10 deletions src/parser/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ def get_directories(self, county, test):

def get_list_of_html(self, case_html_path, case_number, county, test):
# This will loop through the html in the folder they were scraped to.
os.makedirs(case_html_path, exist_ok=True)
case_html_list = os.listdir(case_html_path)

# However, if an optional case number is passed to the function, then read in the case number html file from the data folder
Expand Down Expand Up @@ -104,15 +105,16 @@ def write_json_data(self, case_json_path, case_number, case_data, test):
file_handle.write(json.dumps(case_data))

def write_error_log(self, county, case_number):
    """Record a case number whose HTML could not be parsed.

    Appends *case_number* to data/<county>/cases_with_parsing_error.txt,
    creating the county data directory first if it does not exist.
    """
    basepath = os.path.join(
        os.path.dirname(__file__),
        "..",
        "..",
        "data",
        county,
    )
    os.makedirs(basepath, exist_ok=True)
    # Append rather than overwrite: mode "w" truncated the log on every
    # call, so only the most recent failing case number was ever kept.
    with open(
        os.path.join(basepath, "cases_with_parsing_error.txt"),
        "a",
    ) as file_handle:
        file_handle.write(case_number + "\n")
Expand Down Expand Up @@ -148,8 +150,13 @@ def parse(self, county, case_number, test): #remove the test value here and just
case_html_file_path = self.get_html_path(case_html_path, case_html_file_name, case_number, test)

print(f"{case_number} - parsing")
with open(case_html_file_path, "r") as file_handle:
case_soup = BeautifulSoup(file_handle, "html.parser", from_encoding="UTF-8")
# strip out invalid utf-8 characters
with open(case_html_file_path, "r", encoding='utf-8', errors='ignore') as file_handle:
try:
case_soup = BeautifulSoup(file_handle, "html.parser", from_encoding="UTF-8")
except Exception as e:
print(f'error building beautiful soup for file {case_html_file_path}, {e}')
raise e

# Get the county-specific parser class and method
parser_instance, parser_function = self.get_class_and_method(county=county)
Expand Down
1 change: 1 addition & 0 deletions src/scraper/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import importlib

class Scraper:
"""Scrape Odyssey html files into an output folder"""
def __init__(self):
    # No per-instance state to initialize.
    pass

Expand Down
3 changes: 3 additions & 0 deletions src/tester/test_unittest.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
from cleaner import Cleaner
from updater import Updater

SKIP_SLOW = os.getenv('SKIP_SLOW', 'false').lower().strip() == 'true'

def log(message, level='INFO'): # Provide message and info level (optional, defaulting to info)
# configure the logger
log = logging.getLogger(name="pid: " + str(os.getpid()))
Expand Down Expand Up @@ -252,6 +254,7 @@ def test_scrape_results_page(self,
#def scrape_case_data_pre2017()
#def scrape_case_data_post2017()

@unittest.skipIf(SKIP_SLOW, "slow")
def test_scrape_multiple_cases(self,
county = 'hays',
odyssey_version = 2003,
Expand Down
Loading