From 4eb4a8ddc734f959fbc704307727920a56a8050b Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Thu, 24 Aug 2023 11:54:14 +0200 Subject: [PATCH 1/2] Add TextCat Dolly usage example. --- usage_examples/textcat_dolly/README.md | 33 +++++++++++++++ usage_examples/textcat_dolly/__init__.py | 3 ++ usage_examples/textcat_dolly/examples.jsonl | 3 ++ usage_examples/textcat_dolly/fewshot.cfg | 28 +++++++++++++ usage_examples/textcat_dolly/run_pipeline.py | 43 ++++++++++++++++++++ usage_examples/textcat_dolly/zeroshot.cfg | 22 ++++++++++ 6 files changed, 132 insertions(+) create mode 100644 usage_examples/textcat_dolly/README.md create mode 100644 usage_examples/textcat_dolly/__init__.py create mode 100644 usage_examples/textcat_dolly/examples.jsonl create mode 100644 usage_examples/textcat_dolly/fewshot.cfg create mode 100644 usage_examples/textcat_dolly/run_pipeline.py create mode 100644 usage_examples/textcat_dolly/zeroshot.cfg diff --git a/usage_examples/textcat_dolly/README.md b/usage_examples/textcat_dolly/README.md new file mode 100644 index 00000000..049191d0 --- /dev/null +++ b/usage_examples/textcat_dolly/README.md @@ -0,0 +1,33 @@ +# Using open-source Dolly models hosted on Huggingface + +This example shows how you can use the [open-source Dolly +models](https://github.com/databrickslabs/dolly) hosted on Huggingface for categorizing texts in +zero- or few-shot settings. Here, we perform binary text classification to +determine if a given text is an `INSULT` or a `COMPLIMENT`. + +You can run the pipeline on a sample text via: + +```sh +python run_pipeline.py [TEXT] [PATH TO CONFIG] [PATH TO FILE WITH EXAMPLES] +``` + +For example: + +```sh +python run_pipeline.py "You look great today! Nice shirt!" ./zeroshot.cfg +``` +or, for few-shot: +```sh +python run_pipeline.py "You look great today! Nice shirt!" ./fewshot.cfg ./examples.jsonl +``` + +You can also include examples to perform few-shot annotation. To do so, use the +`fewshot.cfg` file instead. 
You can find the few-shot examples in +the `examples.jsonl` file. Feel free to change and update it to your liking. +We also support other file formats, including `.yml`, `.yaml` and `.json`. + +Finally, you can update the Dolly model in the configuration file. We're using +[`dolly-v2-3b`](https://huggingface.co/databricks/dolly-v2-3b) by default, but +you can change it to a larger model size like +[`dolly-v2-7b`](https://huggingface.co/databricks/dolly-v2-7b) or +[`dolly-v2-12b`](https://huggingface.co/databricks/dolly-v2-12b). diff --git a/usage_examples/textcat_dolly/__init__.py b/usage_examples/textcat_dolly/__init__.py new file mode 100644 index 00000000..06fab2f6 --- /dev/null +++ b/usage_examples/textcat_dolly/__init__.py @@ -0,0 +1,3 @@ +from .run_pipeline import run_pipeline + +__all__ = ["run_pipeline"] diff --git a/usage_examples/textcat_dolly/examples.jsonl b/usage_examples/textcat_dolly/examples.jsonl new file mode 100644 index 00000000..68ea1d4b --- /dev/null +++ b/usage_examples/textcat_dolly/examples.jsonl @@ -0,0 +1,3 @@ +{"text":"Shall I compare thee to a summer's day? 
Thou art more lovely and more temperate","answer":"COMPLIMENT"}
+{"text":"That you have such a February face, so full of frost, of storm and cloudiness","answer":"INSULT"}
+{"text":"Thou art wise as thou art beautiful","answer":"COMPLIMENT"}
diff --git a/usage_examples/textcat_dolly/fewshot.cfg b/usage_examples/textcat_dolly/fewshot.cfg
new file mode 100644
index 00000000..25dda2a1
--- /dev/null
+++ b/usage_examples/textcat_dolly/fewshot.cfg
@@ -0,0 +1,28 @@
+[paths]
+examples = null
+
+[nlp]
+lang = "en"
+pipeline = ["llm"]
+batch_size = 128
+
+[components]
+
+[components.llm]
+factory = "llm"
+
+[components.llm.model]
+@llm_models = "spacy.Dolly.v1"
+name = "dolly-v2-3b"
+
+[components.llm.task]
+@llm_tasks = "spacy.TextCat.v2"
+labels = COMPLIMENT,INSULT
+exclusive_classes = true
+
+[components.llm.task.examples]
+@misc = "spacy.FewShotReader.v1"
+path = ${paths.examples}
+
+[components.llm.task.normalizer]
+@misc = "spacy.LowercaseNormalizer.v1"
diff --git a/usage_examples/textcat_dolly/run_pipeline.py b/usage_examples/textcat_dolly/run_pipeline.py
new file mode 100644
index 00000000..cd0c8baa
--- /dev/null
+++ b/usage_examples/textcat_dolly/run_pipeline.py
@@ -0,0 +1,43 @@
+import os
+from pathlib import Path
+from typing import Optional
+
+import typer
+from wasabi import msg
+
+from spacy_llm.util import assemble
+
+Arg = typer.Argument
+Opt = typer.Option
+
+
+def run_pipeline(
+    # fmt: off
+    text: str = Arg("", help="Text to perform text categorization on."),
+    config_path: Path = Arg(..., help="Path to the configuration file to use."),
+    examples_path: Optional[Path] = Arg(None, help="Path to the examples file to use (few-shot only)."),
+    verbose: bool = Opt(False, "--verbose", "-v", help="Show extra information."),
+    # fmt: on
+):
+    # NOTE: unlike the OpenAI usage examples, this pipeline runs
+    # databricks/dolly-v2-* locally via Hugging Face transformers,
+    # so no OPENAI_API_KEY (or any other API key) is required. The
+    # previous check here was copied from the OpenAI examples and
+    # would wrongly abort for users without an OpenAI key. The first
+    # run downloads the model weights and can take a while.
+
+    msg.text(f"Loading config from {config_path}", show=verbose)
+    nlp = assemble(
+        config_path,
+        overrides={}
+        if examples_path is None
+        else {"paths.examples": str(examples_path)},
+    )
+    doc = nlp(text)
+
+    msg.text(f"Text: {doc.text}")
+    msg.text(f"Categories: {doc.cats}")
+
+
+if __name__ == "__main__":
+    typer.run(run_pipeline)
diff --git a/usage_examples/textcat_dolly/zeroshot.cfg b/usage_examples/textcat_dolly/zeroshot.cfg
new file mode 100644
index 00000000..76e87114
--- /dev/null
+++ b/usage_examples/textcat_dolly/zeroshot.cfg
@@ -0,0 +1,22 @@
+[nlp]
+lang = "en"
+pipeline = ["llm"]
+batch_size = 128
+
+[components]
+
+[components.llm]
+factory = "llm"
+
+[components.llm.model]
+@llm_models = "spacy.Dolly.v1"
+name = "dolly-v2-3b"
+
+[components.llm.task]
+@llm_tasks = "spacy.TextCat.v2"
+labels = COMPLIMENT,INSULT
+examples = null
+exclusive_classes = true
+
+[components.llm.task.normalizer]
+@misc = "spacy.LowercaseNormalizer.v1"
From 8638e3d1cd6317d1f0de130ae862715eca1b2c13 Mon Sep 17 00:00:00 2001
From: Raphael Mitsch
Date: Thu, 24 Aug 2023 12:52:28 +0200
Subject: [PATCH 2/2] Readd TextCat test. Fix NER Dolly test.

---
 usage_examples/ner_dolly/examples.yml       |  8 ++++----
 usage_examples/tests/test_usage_examples.py | 18 ++++++++++++++++++
 2 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/usage_examples/ner_dolly/examples.yml b/usage_examples/ner_dolly/examples.yml
index 7e653f7d..6d02fb1e 100644
--- a/usage_examples/ner_dolly/examples.yml
+++ b/usage_examples/ner_dolly/examples.yml
@@ -1,15 +1,15 @@
 - text: Jack and Jill went up the hill.
   entities:
-    PER:
+    PERSON:
       - Jack
       - Jill
-    LOC:
+    LOCATION:
       - hill
 - text: Jack fell down and broke his crown.
   entities:
-    PER:
+    PERSON:
       - Jack
 - text: Jill came tumbling after. 
  entities:
-    PER:
+    PERSON:
       - Jill
diff --git a/usage_examples/tests/test_usage_examples.py b/usage_examples/tests/test_usage_examples.py
index 114a078f..b8735adb 100644
--- a/usage_examples/tests/test_usage_examples.py
+++ b/usage_examples/tests/test_usage_examples.py
@@ -11,6 +11,24 @@
 _USAGE_EXAMPLE_PATH = Path(__file__).parent.parent
 
 
+@pytest.mark.gpu
+@pytest.mark.skipif(not has_torch_cuda_gpu, reason="needs GPU & CUDA")
+@pytest.mark.parametrize("config_name", ("fewshot.cfg", "zeroshot.cfg"))
+def test_textcat_dolly(config_name: str):
+    """Test TextCat Dolly usage example.
+    config_name (str): Name of config file to use.
+    """
+    path = _USAGE_EXAMPLE_PATH / "textcat_dolly"
+    textcat_dolly.run_pipeline(
+        text="text",
+        config_path=path / config_name,
+        examples_path=None
+        if config_name == "zeroshot.cfg"
+        else path / "examples.jsonl",
+        verbose=False,
+    )
+
+
 @pytest.mark.gpu
 @pytest.mark.skipif(not has_torch_cuda_gpu, reason="needs GPU & CUDA")
 @pytest.mark.parametrize("config_name", ("fewshot.cfg", "zeroshot.cfg"))