Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

keywords weights #58

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 16 additions & 4 deletions exorde/extract_keywords.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import re
import string
from keybert import KeyBERT
from typing import Tuple
try:
import nltk
nltk.download('punkt')
Expand Down Expand Up @@ -170,27 +171,38 @@ def remove_invalid_keywords(input_list):
return output_list


def extract_keywords(translation: Translation) -> Tuple[Keywords, KeywordsWeights]:
    """Extract keywords from a translation, with one weight per keyword.

    Keywords are gathered from ``translation.translation`` by several
    extractors: ``_extract_keywords1`` and ``_extract_keywords_bis`` (each
    item is indexed as ``(keyword, weight)``), then ticker symbols, extra
    special keywords and symbol acronyms. Every member of one of the three
    unweighted groups receives ``1 / len(group)``.

    The keyword list is subsequently filtered, concatenated, validated,
    de-duplicated and ordered, so its length and order no longer match the
    order in which weights were collected. Weights are therefore stored per
    keyword and looked up only once the final keyword list is known, which
    guarantees that both returned sequences have the same length and that
    position ``i`` of one describes position ``i`` of the other.

    Args:
        translation: Object whose ``translation`` attribute holds the text
            to analyse.

    Returns:
        A ``(Keywords, KeywordsWeights)`` pair of equal length.
    """
    content: str = translation.translation

    # keyword -> weight; the single source of truth for the final lookup.
    weight_by_keyword = {}

    def remember(keyword, weight) -> None:
        # When several extractors report the same keyword, keep the
        # strongest weight instead of an arbitrary one.
        known = weight_by_keyword.get(keyword)
        if known is None or weight > known:
            weight_by_keyword[keyword] = weight

    def remember_group(group) -> None:
        # Unweighted groups share a total weight of 1 between their members.
        # The loop body never runs for an empty group, so no division by zero.
        for keyword in group:
            remember(keyword, 1 / len(group))

    keywords_ = []
    for extractor in (_extract_keywords1, _extract_keywords_bis):
        # set() drops exact duplicate (keyword, weight) pairs.
        for pair in set(extractor(content)):
            keywords_.append(pair[0])
            remember(pair[0], pair[1])

    keywords_ = filter_strings(keywords_)

    try:
        tickers = get_ticker_symbols(content)
        keywords_.extend(tickers)
        remember_group(tickers)
    except Exception as e:
        # Best effort: ticker extraction must not abort keyword extraction.
        print(f"Error in ticker symbols extraction: {e}")

    try:
        bonus_keywords = get_extra_special_keywords(content)
        keywords_.extend(bonus_keywords)
        remember_group(bonus_keywords)
        acronyms = get_symbol_acronyms(content)
        keywords_.extend(acronyms)
        remember_group(acronyms)
        keywords_ = get_concatened_keywords(keywords_)
        keywords_ = remove_invalid_keywords(keywords_)
        keywords_ = order_keywords(keywords_, content)
    except Exception as e:
        # Best effort: keep whatever was collected before the failure.
        print(f"Error in advanced keywords extraction: {e}")

    # De-duplicate, then order once more because set() discards the order.
    ordered = Keywords(order_keywords(list(set(keywords_)), content))

    # NOTE(review): a keyword that none of the extractors reported verbatim
    # (presumably one built by get_concatened_keywords) has no recorded
    # weight and falls back to 0.0 -- confirm this is the desired default.
    weights = [weight_by_keyword.get(keyword, 0.0) for keyword in ordered]
    return ordered, KeywordsWeights(weights)
7 changes: 7 additions & 0 deletions exorde/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,13 @@ class Keywords(list, metaclass=MadType):
# Dict-based model, built through the MadType metaclass, that declares a
# single ``top_keywords`` entry annotated as ``Keywords``.
class TopKeywords(dict, metaclass=MadType):
    top_keywords: Keywords

# List-based model, built through the MadType metaclass, holding the weights
# that accompany the extracted keywords.
class KeywordsWeights(list, metaclass=MadType):
    # Human-readable description of the field.
    description = "The main keywords weights extracted from the content field"
    # Declared content type: a list of floats.
    # NOTE(review): presumably consumed by MadType for validation -- confirm.
    annotation = list[float]

# Dict-based model, built through the MadType metaclass, that declares a
# single ``top_keywords_weights`` entry annotated as ``KeywordsWeights``.
class TopKeywordsWeights(dict, metaclass=MadType):
    top_keywords_weights: KeywordsWeights


class Classification(dict, metaclass=MadType):
description = "label and score of zero_shot"
Expand Down
3 changes: 2 additions & 1 deletion exorde/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ async def process(
raise err

try:
top_keywords: Keywords = extract_keywords(translation)
top_keywords: Keywords, top_keywords_weights: KeywordsWeights = extract_keywords(translation)
except Exception as err:
logging.error("An error occured populating keywords for an item")
logging.error(err)
Expand All @@ -71,6 +71,7 @@ async def process(
item=item,
translation=translation,
top_keywords=top_keywords,
top_keywords_weights=top_keywords_weights,
classification=classification,
)
except Exception as err:
Expand Down
Loading