Skip to content

Commit

Permalink
Patching HuggingFace offline mode & cap keyword max length (#52)
Browse files Browse the repository at this point in the history
* Update extract_keywords.py

added max length for keywords

* Hugging face offline mode (#53)

* Update spotting.py

* Update pre_install.py

* Update spotting.py

---------

Co-authored-by: Mathias Dail <93382891+MathiasExorde@users.noreply.github.com>

* Update setup.py

---------

Co-authored-by: Mathias Dail <93382891+MathiasExorde@users.noreply.github.com>
  • Loading branch information
6r17 and MathiasExorde authored Feb 12, 2024
1 parent 1b50e85 commit f10cadb
Show file tree
Hide file tree
Showing 4 changed files with 49 additions and 8 deletions.
5 changes: 4 additions & 1 deletion exorde/extract_keywords.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
print("nltk already downloaded or error")
from exorde.models import Keywords, Translation

MAX_KEYWORD_LENGTH = 50

def is_good_1gram(word):
special_chars = set(string.punctuation.replace("-", ""))
Expand Down Expand Up @@ -154,10 +155,12 @@ def remove_invalid_keywords(input_list):
for s in input_list:
# remove any double slash and any url. ex: "//CONNECT.COM" and "https://CONNECT.COM"
s = re.sub(r'//|https?:\/\/.*[\r\n]*', '', s)
if len(s) > 2:
# Add check for length of the keyword
if 2 < len(s) and len(s) <= MAX_KEYWORD_LENGTH and s not in output_list:
output_list.append(s)
return output_list


def extract_keywords(translation: Translation) -> Keywords:
content: str = translation.translation
kx1 = _extract_keywords1(content)
Expand Down
32 changes: 27 additions & 5 deletions exorde/pre_install.py
Original file line number Diff line number Diff line change
@@ -1,35 +1,57 @@
from transformers import AutoModel, AutoTokenizer

import os
from argostranslate import package
from typing import cast
import logging
from wtpsplit import WtP
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, pipeline
from huggingface_hub import hf_hub_download
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from ftlangdetect import detect as ft_test_detect

# Instantiate the "wtp-canine-s-1l" WtP model once during pre-install.
# NOTE(review): presumably this forces the model files to be fetched and cached — confirm.
print("importing wtpsplit....")
wtp = WtP("wtp-canine-s-1l")
# Run language detection once on a dummy string.
# NOTE(review): presumably this makes ftlangdetect fetch its model up front — confirm.
print("init ftlangdetect")
test_lang_detect = ft_test_detect("test")

# Hugging Face model ids to load during pre-install.
# Every entry must end with a comma: adjacent string literals are implicitly
# concatenated, which would silently merge two ids into one invalid id.
models = [
    "MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli",
    "SamLowe/roberta-base-go_emotions",
    "cardiffnlp/twitter-roberta-base-irony",
    "salesken/query_wellformedness_score",
    "marieke93/MiniLM-evidence-types",
    "alimazhar-110/website_classification",
    "mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis",
    "lxyuan/distilbert-base-multilingual-cased-sentiments-student",
    "bert-large-uncased",
]

def install_hugging_face_models(models):
    """Load the tokenizer and the model for every id in *models*.

    Each id is passed to ``AutoTokenizer.from_pretrained`` and then to
    ``AutoModel.from_pretrained``; the returned objects are discarded.

    NOTE(review): presumably the purpose is to populate the local Hugging
    Face cache at install time -- confirm.

    Args:
        models: iterable of model identifier strings.

    Returns:
        None.
    """
    for model_id in models:
        print(f"installing model {model_id}...")
        # The loaded objects are deliberately not bound to a name: nothing
        # here uses them, and the loop variable must keep holding the id.
        AutoTokenizer.from_pretrained(model_id)
        AutoModel.from_pretrained(model_id)

# Instantiate the sentence-embedding model, then load every id listed in `models`.
# NOTE(review): presumably both calls exist only to fetch and cache the weights — confirm.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
install_hugging_face_models(models)

# Hugging Face hub cache directory under the current user's home directory.
# expanduser("~") is used instead of os.getenv('HOME'): getenv returns None
# when HOME is unset (e.g. on Windows), which makes os.path.join raise a
# TypeError. When HOME is set, expanduser("~") resolves to the same directory.
cache_dir = os.path.join(os.path.expanduser("~"), '.cache', 'huggingface', 'hub')

# Download the two JSON lexicons from the ExordeLabs/SentimentDetection repo
# into cache_dir; hf_hub_download returns the value printed after each call.
print("install emoji_lexicon")
emoji_lexicon = hf_hub_download(
    repo_id="ExordeLabs/SentimentDetection",
    filename="emoji_unic_lexicon.json",
    cache_dir=cache_dir,
)
print(f"emoji lexicon downloaded : {emoji_lexicon}")
print("install loughran_dict")
loughran_dict = hf_hub_download(
    repo_id="ExordeLabs/SentimentDetection",
    filename="loughran_dict.json",
    cache_dir=cache_dir,
)
print(f"loughran downloaded : {loughran_dict}")
# List the cache folder so the install log shows what is present.
content_list = os.listdir(cache_dir)
print("Contents of the cache folder:", content_list)

### install (pre-install) models targeting English, and exclude low-frequency ones so as not to overload the install
def is_english_target(s):
    """Return True when *s* contains the marker '→ English'."""
    marker = '→ English'
    return marker in s
Expand Down
18 changes: 17 additions & 1 deletion exorde/spotting.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import logging
import argparse

import os
from exorde.models import Processed

from typing import Union
Expand Down Expand Up @@ -146,6 +146,22 @@ async def spotting(
batch, static_configuration
)
logging.info("Successfully processed batch")
###############################################
### SETTING HUGGING FACE HUB TO OFFLINE MODE
##### NOW THAT ALL MODELS ARE PROVEN OK
# check whether the TRANSFORMERS_OFFLINE env var is anything other than "1" (including unset)
# if so, set it to "1" and log the change

# Check if the TRANSFORMERS_OFFLINE environment variable is unset or not equal to '1'
if os.environ.get("TRANSFORMERS_OFFLINE") != "1":
# Set the TRANSFORMERS_OFFLINE environment variable to '1'
os.environ["TRANSFORMERS_OFFLINE"] = "1"
logging.info("TRANSFORMERS_OFFLINE environment variable was set to 1.")
else:
# If the variable is already set to '1', inform the user
logging.info("[HUGGING FACE MODE] OFFLINE")

###############################################
await websocket_send(
{
"jobs": {
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

setup(
name="exorde",
version="v2.5.2",
version="v2.5.3",
author="Exorde Labs",
author_email="hello@exordelabs.com",
description="The AI-based client to mine data and power the Exorde Network",
Expand Down

0 comments on commit f10cadb

Please sign in to comment.