Skip to content

Commit

Permalink
updated PII scanner code
Browse files Browse the repository at this point in the history
  • Loading branch information
alinvetian committed Oct 22, 2024
1 parent c6b39ba commit 8820019
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 17 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ module StashEngine
class PiiScanLambdaSenderService < BaseSenderService

def call
trigger_call('excelToCsv')
trigger_call('sensitive_data_scan')
end

private
Expand Down
31 changes: 15 additions & 16 deletions script/sensitive_data/lambda.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,24 @@
import re
import json
import time
import pandas as pd
import requests
import os
from urllib.parse import urlparse
from document_scanner import DocumentScanner
from logger import Logger
from response import Response

# event json has these params passed in: download_url, callback_url, file_mime_type, token
def lambda_handler(event, context):
download_url = event['download_url']
file_extension = get_file_extension(download_url)

file_path = get_file_path(download_url)
logger = Logger(file_path)

logger.log(f"parsing file: {get_file_path(download_url)}")
logger.log(f"callback_url: {event['callback_url']}")

file_extension = get_file_extension(download_url)
if file_extension in ['.txt', '.log', '.csv']:
scanner = DocumentScanner(download_url)
response = scanner.scan()
Expand All @@ -28,6 +34,7 @@ def lambda_handler(event, context):
response = file_not_supported_response()
report_status = "noissues"

logger.log(f"end parsing with status: {report_status}")
# Send report to callback_url
update(token=event["token"], status=report_status, report=json.dumps({'report': response.__dict__}), callback=event['callback_url'])

Expand All @@ -39,25 +46,17 @@ def get_file_extension(url):
file_path = parsed_url.path
return os.path.splitext(file_path)[1]

def get_file_path(url):
    """Return *url* with its query string (and fragment) stripped.

    Keeps only scheme, host and path, e.g.
    ``https://host/a/b.csv?tok=x`` -> ``https://host/a/b.csv``.
    """
    parts = urlparse(url)
    return f"{parts.scheme}://{parts.netloc}{parts.path}"

def file_not_supported_response():
    """Build a Response whose errors list flags an unsupported file type."""
    unsupported = Response()
    unsupported.errors = ['File type not supported']
    return unsupported

# Debugging helper: pretty-print a JSON-encoded scan response to stdout.
def print_response(response):
    """Decode *response* (a JSON string) and print a readable summary.

    Prints the validity flag, one line per reported issue, and any errors.
    """
    parsed = json.loads(response)
    print(f"Valid: {parsed['valid']}")

    for issue in parsed['issues'] or []:
        print(f"Line {issue['line_number']}: {issue['pattern']} -> {issue['matches']}")

    if parsed['errors']:
        print(f"Errors: {parsed['errors']}")

# tries to upload it to our API
def update(token, status, report, callback):
headers = {'Authorization': f'Bearer {token}'}
Expand Down
13 changes: 13 additions & 0 deletions script/sensitive_data/logger.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
import hashlib

class Logger:
    """Minimal stdout logger that tags every message with a worker token.

    The token is derived from the URL being processed, so interleaved log
    lines emitted for the same file can be grouped together afterwards.
    """

    def __init__(self, url):
        self.url = url
        self.token = self.generate_worker_token()

    def log(self, message):
        # Prefix each line with the per-file token for traceability.
        print(f"{self.token} - {message}")

    def generate_worker_token(self):
        # NOTE(review): md5 is used only as a cheap, stable identifier here,
        # not for anything security-sensitive.
        digest = hashlib.md5(self.url.encode())
        return digest.hexdigest()

0 comments on commit 8820019

Please sign in to comment.