From 8820019d4b00a83d68e2448ea74ff89b654f2c71 Mon Sep 17 00:00:00 2001 From: Alin Vetian Date: Tue, 22 Oct 2024 17:51:03 +0300 Subject: [PATCH] updated PII scanner code --- .../pii_scan_lambda_sender_service.rb | 2 +- script/sensitive_data/lambda.py | 31 +++++++++---------- script/sensitive_data/logger.py | 13 ++++++++ 3 files changed, 29 insertions(+), 17 deletions(-) create mode 100644 script/sensitive_data/logger.py diff --git a/app/services/stash_engine/pii_scan_lambda_sender_service.rb b/app/services/stash_engine/pii_scan_lambda_sender_service.rb index 173060af4..706e4e961 100644 --- a/app/services/stash_engine/pii_scan_lambda_sender_service.rb +++ b/app/services/stash_engine/pii_scan_lambda_sender_service.rb @@ -2,7 +2,7 @@ module StashEngine class PiiScanLambdaSenderService < BaseSenderService def call - trigger_call('excelToCsv') + trigger_call('sensitive_data_scan') end private diff --git a/script/sensitive_data/lambda.py b/script/sensitive_data/lambda.py index 55fe3706e..c24e39b02 100644 --- a/script/sensitive_data/lambda.py +++ b/script/sensitive_data/lambda.py @@ -1,18 +1,24 @@ import re import json import time -import pandas as pd import requests import os from urllib.parse import urlparse from document_scanner import DocumentScanner +from logger import Logger from response import Response # event json has these params passed in: download_url, callback_url, file_mime_type, token def lambda_handler(event, context): download_url = event['download_url'] - file_extension = get_file_extension(download_url) + file_path = get_file_path(download_url) + logger = Logger(file_path) + + logger.log(f"parsing file: {get_file_path(download_url)}") + logger.log(f"callback_url: {event['callback_url']}") + + file_extension = get_file_extension(download_url) if file_extension in ['.txt', '.log', '.csv']: scanner = DocumentScanner(download_url) response = scanner.scan() @@ -28,6 +34,7 @@ def lambda_handler(event, context): response = file_not_supported_response() report_status = "noissues" + logger.log(f"end parsing with status: {report_status}") # Send report to callback_url update(token=event["token"], status=report_status, report=json.dumps({'report': response.__dict__}), callback=event['callback_url']) @@ -39,25 +46,17 @@ def get_file_extension(url): file_path = parsed_url.path return os.path.splitext(file_path)[1] +def get_file_path(url): + parsed_url = urlparse(url) + # Reconstruct the base URL without the query parameters + base_url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path}" + return base_url + def file_not_supported_response(): response = Response() response.errors = ['File type not supported'] return response -# Print readable response -# For debugging purposes -def print_response(response): - response = json.loads(response) - print(f"Valid: {response['valid']}") - - pattern_occurrences = response['issues'] - if pattern_occurrences: - for issue in pattern_occurrences: - print(f"Line {issue['line_number']}: {issue['pattern']} -> {issue['matches']}") - - if response['errors']: - print(f"Errors: {response['errors']}") - # tries to upload it to our API def update(token, status, report, callback): headers = {'Authorization': f'Bearer {token}'} diff --git a/script/sensitive_data/logger.py b/script/sensitive_data/logger.py new file mode 100644 index 000000000..32e05d2c7 --- /dev/null +++ b/script/sensitive_data/logger.py @@ -0,0 +1,13 @@ +import hashlib + +class Logger: + def __init__(self, url): + self.url = url + self.token = self.generate_worker_token() + + def log(self, message): + print(f"{self.token} - {message}") + + def generate_worker_token(self): + md5_hash = hashlib.md5(self.url.encode()) + return md5_hash.hexdigest()