Patching HuggingFace offline mode & cap keyword max length (#52)

* Update extract_keywords.py added max length for keywords * Hugging face offline mode (#53) * Update spotting.py * Update pre_install.py * Update spotting.py --------- Co-authored-by: Mathias Dail <93382891+MathiasExorde@users.noreply.github.com> * Update setup.py --------- Co-authored-by: Mathias Dail <93382891+MathiasExorde@users.noreply.github.com>
exorde-labs · Feb 12, 2024 · f10cadb · f10cadb
1 parent 1b50e85
commit f10cadb
Show file tree

Hide file tree

Showing 4 changed files with 49 additions and 8 deletions.
diff --git a/exorde/extract_keywords.py b/exorde/extract_keywords.py
@@ -9,6 +9,7 @@
     print("nltk already downloaded or error")
 from exorde.models import Keywords, Translation
 
+MAX_KEYWORD_LENGTH = 50
 
 def is_good_1gram(word):
     special_chars = set(string.punctuation.replace("-", ""))
@@ -154,10 +155,12 @@ def remove_invalid_keywords(input_list):
     for s in input_list:
         # remove any double slash and any url. ex: "//CONNECT.COM" and "https://CONNECT.COM"
         s = re.sub(r'//|https?:\/\/.*[\r\n]*', '', s)
-        if len(s) > 2:
+        # Add check for length of the keyword
+        if 2 < len(s) and len(s) <= MAX_KEYWORD_LENGTH and s not in output_list:
             output_list.append(s)
     return output_list
 
+
 def extract_keywords(translation: Translation) -> Keywords:
     content: str = translation.translation       
     kx1 = _extract_keywords1(content)

diff --git a/exorde/pre_install.py b/exorde/pre_install.py
@@ -1,35 +1,57 @@
 from transformers import AutoModel, AutoTokenizer
-
+import os
 from argostranslate import package
 from typing import cast
 import logging
-from wtpsplit import WtP
 from sentence_transformers import SentenceTransformer
 from transformers import AutoTokenizer, pipeline
 from huggingface_hub import hf_hub_download
 from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
+from ftlangdetect import detect as ft_test_detect
 
-print("importing wtpsplit....")
-wtp = WtP("wtp-canine-s-1l")
+print("init ftlangdetect")
+test_lang_detect = ft_test_detect("test")
 
 models = [
+    "MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli",
     "SamLowe/roberta-base-go_emotions",
     "cardiffnlp/twitter-roberta-base-irony",
     "salesken/query_wellformedness_score",
     "marieke93/MiniLM-evidence-types",
     "alimazhar-110/website_classification",
     "mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis",
-    "lxyuan/distilbert-base-multilingual-cased-sentiments-student"
+    "lxyuan/distilbert-base-multilingual-cased-sentiments-student",
+    "bert-large-uncased",
 ]
 
 def install_hugging_face_models(models):
     for model in models:
+        print(f"installing model {model}...")        
         __tokenizer__ = AutoTokenizer.from_pretrained(model)
         model = AutoModel.from_pretrained(model)
 
 model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
 install_hugging_face_models(models)
 
+cache_dir = os.path.join(os.getenv('HOME'), '.cache', 'huggingface', 'hub')
+
+print("install emoji_lexicon")
+emoji_lexicon = hf_hub_download(
+    repo_id="ExordeLabs/SentimentDetection",
+    filename="emoji_unic_lexicon.json",
+    cache_dir=cache_dir
+)
+print(f"emoji lexicon downloaded : {emoji_lexicon}")
+print("install loughran_dict")
+loughran_dict = hf_hub_download(
+    repo_id="ExordeLabs/SentimentDetection",
+    filename="loughran_dict.json",
+    cache_dir=cache_dir
+)
+print(f"loughran downloaded : {loughran_dict}")
+content_list = os.listdir(cache_dir)
+print("Contents of the cache folder:", content_list)
+
 ### install (pre-install) models targeting English, and exclude low-frequency ones so as not to overload the install
 def is_english_target(s):
     return '→ English' in s

diff --git a/exorde/spotting.py b/exorde/spotting.py
@@ -1,6 +1,6 @@
 import logging
 import argparse
-
+import os
 from exorde.models import Processed
 
 from typing import Union
@@ -146,6 +146,22 @@ async def spotting(
             batch, static_configuration
         )
         logging.info("Successfully processed batch")
+        ###############################################
+        ###   SETTING HUGGING FACE HUB TO OFFLINE MODE
+        ##### NOW THAT ALL MODELS ARE PROVEN OK
+        # check whether the TRANSFORMERS_OFFLINE env var is anything other than "1"
+        # (an unset variable counts too); if so, set it to "1" and log the change
+
+        # Check whether the TRANSFORMERS_OFFLINE environment variable is unset or not equal to '1'
+        if os.environ.get("TRANSFORMERS_OFFLINE") != "1":
+            # Set the TRANSFORMERS_OFFLINE environment variable to '1'
+            os.environ["TRANSFORMERS_OFFLINE"] = "1"
+            logging.info("TRANSFORMERS_OFFLINE environment variable was set to 1.")
+        else:
+            # If the variable is already set to '1', inform the user
+            logging.info("[HUGGING FACE MODE] OFFLINE")
+
+        ###############################################
         await websocket_send(
             {
                 "jobs": {

diff --git a/setup.py b/setup.py
@@ -5,7 +5,7 @@
 
 setup(
     name="exorde",
-    version="v2.5.2",
+    version="v2.5.3",
     author="Exorde Labs",
     author_email="hello@exordelabs.com",
     description="The AI-based client to mine data and power the Exorde Network",