Skip to content

Commit

Permalink
Update extract_keywords.py (#54)
Browse files: browse the repository at this point in the history
  • Loading branch information
MathiasExorde authored Feb 28, 2024
1 parent cf16f1d commit bc9101b
Showing 1 changed file with 11 additions and 2 deletions.
13 changes: 11 additions & 2 deletions exorde/extract_keywords.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -92,7 +92,12 @@ def is_valid_keyword(word):
isalpha_count = sum(1 for char in word if char.isalpha())
total_chars = len(word)
punctuation = re.compile(r'[^\w\s,]')
return (uppercase_count / total_chars >= 0.3) and (punctuation.search(word) is not None) and (isalpha_count>1)
# Prevent division by zero
if total_chars > 0:
return (uppercase_count / total_chars >= 0.3) and (punctuation.search(word) is not None) and (isalpha_count>1)
else:
return False


words = nltk.word_tokenize(text)
filtered_words = filter(is_valid_keyword, words)
Expand Down Expand Up @@ -138,7 +143,11 @@ def is_valid_acronym(word):
uppercase_count = sum(1 for char in word if char.isupper())
isalpha_count = sum(1 for char in word if char.isalpha())
total_chars = len(word)
return (uppercase_count / total_chars >= 0.3) and (isalpha_count>=1) and len(word) >= 2
# Prevent division by zero
if total_chars > 0:
return (uppercase_count / total_chars >= 0.3) and (isalpha_count>=1) and len(word) >= 2
else:
return False

# split by space and special punctuation: comma, point, period
# not nltk tokenize
Expand Down

0 comments on commit bc9101b

Please sign in to comment.