
Parser Unittests
aarosan committed Sep 25, 2024
1 parent 43ea802 · commit 2b5fff5
Showing 1 changed file with 153 additions and 90 deletions.
src/tester/test_unittest.py (243 changes: 153 additions & 90 deletions)
@@ -1,16 +1,20 @@
import unittest, sys, os, json, warnings, requests, logging
from unittest.mock import patch, MagicMock, mock_open
from datetime import datetime, timezone, timedelta
import tempfile
from bs4 import BeautifulSoup

current_dir = os.path.dirname(os.path.abspath(__file__))
parent_dir = os.path.dirname(current_dir)
project_root = os.path.dirname(parent_dir)

sys.path.append(parent_dir)

# Import all of the program's modules within the parent_dir
from scraper import Scraper
from parser import Parser
from cleaner import Cleaner
from updater import Updater

SKIP_SLOW = os.getenv('SKIP_SLOW', 'false').lower().strip() == 'true'
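# SKIP_SLOW is read from the environment so slower tests can be skipped by running with SKIP_SLOW=true.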

@@ -329,93 +333,152 @@ def test_scrape_multiple_cases(self,

class ParseTestCase(unittest.TestCase):

def setUp(self):
self.test_dir = tempfile.mkdtemp()
self.case_json_path = os.path.join(self.test_dir, 'hays', 'case_json')
os.makedirs(self.case_json_path, exist_ok=True)

self.mock_logger = MagicMock()
self.parser_instance = Parser()
self.case_html_path = os.path.abspath(
os.path.join(os.path.dirname(__file__), '../../resources/test_files/parser_testing')
)
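# Note: tempfile.mkdtemp() does not remove the directory it creates. A
# tearDown along these lines (with `import shutil` added to the imports)
# would keep test runs from leaking temp directories:
# def tearDown(self):
#     shutil.rmtree(self.test_dir, ignore_errors=True)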

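# Patches Parser.get_class_and_method and checks that the (class instance, method) pair it returns is passed through unchanged.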
@patch('parser.Parser.get_class_and_method')
def test_parser_class_and_method(self, mock_get_class_and_method):
mock_logger = MagicMock()
mock_class = MagicMock()
mock_method = MagicMock()

mock_get_class_and_method.return_value = mock_class.return_value, mock_method

parser_instance = Parser()

instance, method = parser_instance.get_class_and_method(logger=mock_logger, county='hays', test=True)

self.assertEqual(instance, mock_class.return_value)
self.assertEqual(method, mock_method)

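# With parse_single_file=True, both the HTML input and JSON output directories should resolve to the test fixtures folder.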
@patch('os.makedirs')
def test_parser_directories_single_file(self, mock_makedirs):
mock_logger = MagicMock()
parser_instance = Parser()
case_html_path, case_json_path = parser_instance.get_directories('hays', mock_logger, parse_single_file=True)

base_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
expected_path = os.path.join(base_dir, "resources", "test_files")

self.assertEqual(case_html_path, expected_path)
self.assertEqual(case_json_path, expected_path)
mock_logger.info.assert_called()

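# With parse_single_file=False, the parser reads from data/hays/case_html and creates data/hays/case_json when it does not already exist.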
@patch('os.makedirs')
@patch('os.path.exists', return_value=False)
def test_parser_directories_multiple_files(self, mock_exists, mock_makedirs):
mock_logger = MagicMock()
parser_instance = Parser()
case_html_path, case_json_path = parser_instance.get_directories('hays', mock_logger, parse_single_file=False)

base_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
expected_html_path = os.path.join(base_dir, "data", "hays", "case_html")
expected_json_path = os.path.join(base_dir, "data", "hays", "case_json")

self.assertEqual(case_html_path, expected_html_path)
self.assertEqual(case_json_path, expected_json_path)
mock_makedirs.assert_called_once_with(expected_json_path, exist_ok=True)
mock_logger.info.assert_called()

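# A single-file parse should yield exactly one path: resources/test_files/test_<case_number>.html.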
def test_parser_list_of_single_html_file(self):
case_number = '51652356'
case_list = self.parser_instance.get_list_of_html(self.case_html_path, case_number, 'hays', self.mock_logger, parse_single_file=True)

relative_path = os.path.join(project_root, 'resources', 'test_files')

expected_path = os.path.join(relative_path, f'test_{case_number}.html')

self.assertEqual(case_list, [expected_path])
self.mock_logger.info.assert_called()

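# Covers the same single-file lookup keyed by case number; the assertion compares the full returned list.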
def test_parser_list_of_single_html_file_by_casenumber(self):
case_number = '51652356'

case_list = self.parser_instance.get_list_of_html(self.case_html_path, case_number, 'hays', self.mock_logger, parse_single_file=True)

relative_path = os.path.join(project_root, 'resources', 'test_files')

expected_list = [os.path.join(relative_path, f'test_{case_number}.html')]

self.assertEqual(case_list, expected_list)
self.mock_logger.info.assert_called()

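# os.path.join is patched to plain '/' joining, presumably to keep the expected paths platform-independent.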
@patch("os.path.join", side_effect=lambda *args: "/".join(args))
def test_parser_list_of_multiple_html_files(self, mock_join):
updated_html_path = os.path.join(self.case_html_path, 'multiple_html_files')
os.makedirs(updated_html_path, exist_ok=True)

with open(os.path.join(updated_html_path, 'test_1.html'), 'w') as f:
f.write('test')
with open(os.path.join(updated_html_path, 'test_2.html'), 'w') as f:
f.write('test')

case_number = ''
case_list = self.parser_instance.get_list_of_html(updated_html_path, case_number, 'hays', self.mock_logger, parse_single_file=False)

expected_list = [
os.path.join(updated_html_path, 'test_1.html'),
os.path.join(updated_html_path, 'test_2.html')
]

self.assertEqual(set(case_list), set(expected_list))
self.mock_logger.info.assert_called()

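# A nonexistent HTML directory should raise an exception rather than return an empty case list.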
def test_parser_get_list_of_html_error_handling(self):
invalid_path = 'invalid/path'
case_number = '12345'

with self.assertRaises(Exception):
self.parser_instance.get_list_of_html(invalid_path, case_number, 'hays', self.mock_logger, parse_single_file=False)
self.mock_logger.info.assert_called()

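# get_html_path should combine the directory and file name into the full path of a case HTML file.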
@patch('logging.getLogger')
def test_get_html_path(self, mock_logger):
updated_html_path = os.path.join(self.case_html_path, 'multiple_html_files')
case_html_file_name = "parserTest_51652356.html"
case_number = "51652356"

expected_path = os.path.join(updated_html_path, case_html_file_name)

result = self.parser_instance.get_html_path(updated_html_path, case_html_file_name, case_number, self.mock_logger)

self.assertEqual(result, expected_path)

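# open() and json.dumps are mocked so the test can assert the target file name and serialization arguments without touching disk.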
@patch('builtins.open', new_callable=mock_open)
@patch('json.dumps')
@patch('parser.logging.getLogger')
def test_write_json_data(self, mock_logger, mock_json_dumps, mock_open_func):
case_json_path = "/mock/path"
case_number = "123456"
case_data = {"data": "value"}

self.parser_instance.write_json_data(case_json_path, case_number, case_data, mock_logger)

mock_open_func.assert_called_once_with(os.path.join(case_json_path, case_number + ".json"), "w")
mock_json_dumps.assert_called_once_with(case_data, indent=4)

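# write_error_log should record the failing case under data/<county>/cases_with_parsing_error.txt.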
@patch('builtins.open', new_callable=mock_open)
@patch('parser.logging.getLogger')
def test_write_error_log(self, mock_logger, mock_open_func):
county = "hays"
case_number = "123456"

self.parser_instance.write_error_log(county, case_number)

base_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..'))
error_log_path = os.path.join(base_dir, 'data', county, "cases_with_parsing_error.txt")

mock_open_func.assert_called_once_with(error_log_path, "w")

class CleanTestCase(unittest.TestCase):

