From 4eb4a8ddc734f959fbc704307727920a56a8050b Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Thu, 24 Aug 2023 11:54:14 +0200 Subject: [PATCH 1/2] Add TextCat Dolly usage example. --- usage_examples/textcat_dolly/README.md | 33 +++++++++++++++ usage_examples/textcat_dolly/__init__.py | 3 ++ usage_examples/textcat_dolly/examples.jsonl | 3 ++ usage_examples/textcat_dolly/fewshot.cfg | 28 +++++++++++++ usage_examples/textcat_dolly/run_pipeline.py | 43 ++++++++++++++++++++ usage_examples/textcat_dolly/zeroshot.cfg | 22 ++++++++++ 6 files changed, 132 insertions(+) create mode 100644 usage_examples/textcat_dolly/README.md create mode 100644 usage_examples/textcat_dolly/__init__.py create mode 100644 usage_examples/textcat_dolly/examples.jsonl create mode 100644 usage_examples/textcat_dolly/fewshot.cfg create mode 100644 usage_examples/textcat_dolly/run_pipeline.py create mode 100644 usage_examples/textcat_dolly/zeroshot.cfg diff --git a/usage_examples/textcat_dolly/README.md b/usage_examples/textcat_dolly/README.md new file mode 100644 index 00000000..049191d0 --- /dev/null +++ b/usage_examples/textcat_dolly/README.md @@ -0,0 +1,33 @@ +# Using open-source Dolly models hosted on Huggingface + +This example shows how you can use the [open-source Dolly +models](https://github.com/databrickslabs/dolly) hosted on Huggingface for categorizing texts in +zero- or few-shot settings. Here, we perform binary text classification to +determine if a given text is an `INSULT` or a `COMPLIMENT`. + +You can run the pipeline on a sample text via: + +```sh +python run_pipeline.py [TEXT] [PATH TO CONFIG] [PATH TO FILE WITH EXAMPLES] +``` + +For example: + +```sh +python run_pipeline.py "You look great today! Nice shirt!" ./zeroshot.cfg +``` +or, for few-shot: +```sh +python run_pipeline.py "You look great today! Nice shirt!" ./fewshot.cfg ./examples.jsonl +``` + +You can also include examples to perform few-shot annotation. To do so, use the +`fewshot.cfg` file instead. 
You can find the few-shot examples in +the `examples.jsonl` file. Feel free to change and update it to your liking. +We also support other file formats, including `.yml`, `.yaml` and `.json`. + +Finally, you can update the Dolly model in the configuration file. We're using +[`dolly-v2-3b`](https://huggingface.co/databricks/dolly-v2-3b) by default, but +you can change it to a larger model size like +[`dolly-v2-7b`](https://huggingface.co/databricks/dolly-v2-7b) or +[`dolly-v2-12b`](https://huggingface.co/databricks/dolly-v2-12b). diff --git a/usage_examples/textcat_dolly/__init__.py b/usage_examples/textcat_dolly/__init__.py new file mode 100644 index 00000000..06fab2f6 --- /dev/null +++ b/usage_examples/textcat_dolly/__init__.py @@ -0,0 +1,3 @@ +from .run_pipeline import run_pipeline + +__all__ = ["run_pipeline"] diff --git a/usage_examples/textcat_dolly/examples.jsonl b/usage_examples/textcat_dolly/examples.jsonl new file mode 100644 index 00000000..68ea1d4b --- /dev/null +++ b/usage_examples/textcat_dolly/examples.jsonl @@ -0,0 +1,3 @@ +{"text":"Shall I compare thee to a summer's day? 
Thou art more lovely and more temperate","answer":"COMPLIMENT"}
+{"text":"That you have such a February face, so full of frost, of storm and cloudiness","answer":"INSULT"}
+{"text":"Thou art wise as thou art beautiful","answer":"COMPLIMENT"}
diff --git a/usage_examples/textcat_dolly/fewshot.cfg b/usage_examples/textcat_dolly/fewshot.cfg
new file mode 100644
index 00000000..25dda2a1
--- /dev/null
+++ b/usage_examples/textcat_dolly/fewshot.cfg
@@ -0,0 +1,28 @@
+[paths]
+examples = null
+
+[nlp]
+lang = "en"
+pipeline = ["llm"]
+batch_size = 128
+
+[components]
+
+[components.llm]
+factory = "llm"
+
+[components.llm.model]
+@llm_models = "spacy.Dolly.v1"
+name = "dolly-v2-3b"
+
+[components.llm.task]
+@llm_tasks = "spacy.TextCat.v2"
+labels = COMPLIMENT,INSULT
+exclusive_classes = true
+
+[components.llm.task.examples]
+@misc = "spacy.FewShotReader.v1"
+path = ${paths.examples}
+
+[components.llm.task.normalizer]
+@misc = "spacy.LowercaseNormalizer.v1"
diff --git a/usage_examples/textcat_dolly/run_pipeline.py b/usage_examples/textcat_dolly/run_pipeline.py
new file mode 100644
index 00000000..cd0c8baa
--- /dev/null
+++ b/usage_examples/textcat_dolly/run_pipeline.py
@@ -0,0 +1,43 @@
+import os
+from pathlib import Path
+from typing import Optional
+
+import typer
+from wasabi import msg
+
+from spacy_llm.util import assemble
+
+Arg = typer.Argument
+Opt = typer.Option
+
+
+def run_pipeline(
+    # fmt: off
+    text: str = Arg("", help="Text to perform text categorization on."),
+    config_path: Path = Arg(..., help="Path to the configuration file to use."),
+    examples_path: Optional[Path] = Arg(None, help="Path to the examples file to use (few-shot only)."),
+    verbose: bool = Opt(False, "--verbose", "-v", help="Show extra information."),
+    # fmt: on
+):
+    # NOTE: unlike the OpenAI usage examples, this pipeline runs
+    # databricks/dolly-v2-* locally via Hugging Face transformers,
+    # so no OPENAI_API_KEY (or any other API key) is required. The
+    # previous check here was copied from the OpenAI examples and
+    # would wrongly abort for users without an OpenAI key. The first
+    # run downloads the model weights and can take a while.
+
+    msg.text(f"Loading config from {config_path}", show=verbose)
+    nlp = assemble(
+        config_path,
+        overrides={}
+        if examples_path is None
+        else {"paths.examples": str(examples_path)},
+    )
+    doc = nlp(text)
+
+    msg.text(f"Text: {doc.text}")
+    msg.text(f"Categories: {doc.cats}")
+
+
+if __name__ == "__main__":
+    typer.run(run_pipeline)
diff --git a/usage_examples/textcat_dolly/zeroshot.cfg b/usage_examples/textcat_dolly/zeroshot.cfg
new file mode 100644
index 00000000..76e87114
--- /dev/null
+++ b/usage_examples/textcat_dolly/zeroshot.cfg
@@ -0,0 +1,22 @@
+[nlp]
+lang = "en"
+pipeline = ["llm"]
+batch_size = 128
+
+[components]
+
+[components.llm]
+factory = "llm"
+
+[components.llm.model]
+@llm_models = "spacy.Dolly.v1"
+name = "dolly-v2-3b"
+
+[components.llm.task]
+@llm_tasks = "spacy.TextCat.v2"
+labels = COMPLIMENT,INSULT
+examples = null
+exclusive_classes = true
+
+[components.llm.task.normalizer]
+@misc = "spacy.LowercaseNormalizer.v1"
From 8638e3d1cd6317d1f0de130ae862715eca1b2c13 Mon Sep 17 00:00:00 2001
From: Raphael Mitsch
Date: Thu, 24 Aug 2023 12:52:28 +0200
Subject: [PATCH 2/2] Readd TextCat test. Fix NER Dolly test.

---
 usage_examples/ner_dolly/examples.yml       |  8 ++++----
 usage_examples/tests/test_usage_examples.py | 18 ++++++++++++++++++
 2 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/usage_examples/ner_dolly/examples.yml b/usage_examples/ner_dolly/examples.yml
index 7e653f7d..6d02fb1e 100644
--- a/usage_examples/ner_dolly/examples.yml
+++ b/usage_examples/ner_dolly/examples.yml
@@ -1,15 +1,15 @@
 - text: Jack and Jill went up the hill.
   entities:
-    PER:
+    PERSON:
       - Jack
       - Jill
-    LOC:
+    LOCATION:
       - hill
 - text: Jack fell down and broke his crown.
   entities:
-    PER:
+    PERSON:
       - Jack
 - text: Jill came tumbling after. 
  entities:
-    PER:
+    PERSON:
       - Jill
diff --git a/usage_examples/tests/test_usage_examples.py b/usage_examples/tests/test_usage_examples.py
index 114a078f..b8735adb 100644
--- a/usage_examples/tests/test_usage_examples.py
+++ b/usage_examples/tests/test_usage_examples.py
@@ -11,6 +11,24 @@
 _USAGE_EXAMPLE_PATH = Path(__file__).parent.parent
 
 
+@pytest.mark.gpu
+@pytest.mark.skipif(not has_torch_cuda_gpu, reason="needs GPU & CUDA")
+@pytest.mark.parametrize("config_name", ("fewshot.cfg", "zeroshot.cfg"))
+def test_textcat_dolly(config_name: str):
+    """Test TextCat Dolly usage example.
+    config_name (str): Name of config file to use.
+    """
+    path = _USAGE_EXAMPLE_PATH / "textcat_dolly"
+    textcat_dolly.run_pipeline(
+        text="text",
+        config_path=path / config_name,
+        examples_path=None
+        if config_name == "zeroshot.cfg"
+        else path / "examples.jsonl",
+        verbose=False,
+    )
+
+
 @pytest.mark.gpu
 @pytest.mark.skipif(not has_torch_cuda_gpu, reason="needs GPU & CUDA")
 @pytest.mark.parametrize("config_name", ("fewshot.cfg", "zeroshot.cfg"))