From 0bcbf66c5b4b91ebc02f2e97219fd46401e38650 Mon Sep 17 00:00:00 2001 From: Celina Hanouti Date: Wed, 16 Oct 2024 17:32:33 +0200 Subject: [PATCH] [Inference API] Add image-text-to-text task and fix `generate` script (#1440) * Add image-text-to-text task and fix generate script * Run generate script * fix chat-completion and image-text-to-text docs * fix typo * Fix chat completion package reference links * regenerate inference api docs --- docs/api-inference/_toctree.yml | 2 + .../tasks/audio-classification.md | 10 +- .../tasks/automatic-speech-recognition.md | 5 +- docs/api-inference/tasks/chat-completion.md | 106 ++++++++++++++-- .../api-inference/tasks/feature-extraction.md | 3 +- docs/api-inference/tasks/fill-mask.md | 3 +- .../tasks/image-classification.md | 4 +- .../api-inference/tasks/image-segmentation.md | 3 +- .../api-inference/tasks/image-text-to-text.md | 115 ++++++++++++++++++ docs/api-inference/tasks/image-to-image.md | 2 +- docs/api-inference/tasks/object-detection.md | 3 +- .../api-inference/tasks/question-answering.md | 3 +- docs/api-inference/tasks/summarization.md | 3 +- .../tasks/table-question-answering.md | 3 +- .../tasks/text-classification.md | 4 +- docs/api-inference/tasks/text-generation.md | 3 +- docs/api-inference/tasks/text-to-image.md | 4 +- .../tasks/token-classification.md | 3 +- docs/api-inference/tasks/translation.md | 3 +- .../tasks/zero-shot-classification.md | 4 +- scripts/api-inference/package.json | 2 +- scripts/api-inference/scripts/generate.ts | 87 ++++++++----- .../templates/task/chat-completion.handlebars | 20 ++- .../task/image-text-to-text.handlebars | 23 ++++ 24 files changed, 332 insertions(+), 86 deletions(-) create mode 100644 docs/api-inference/tasks/image-text-to-text.md create mode 100644 scripts/api-inference/templates/task/image-text-to-text.handlebars diff --git a/docs/api-inference/_toctree.yml b/docs/api-inference/_toctree.yml index 123f62ca4..f19c04503 100644 --- a/docs/api-inference/_toctree.yml +++ b/docs/api-inference/_toctree.yml @@ -30,6 +30,8 @@ title: Image Segmentation - local: tasks/image-to-image title: Image to Image + - local: tasks/image-text-to-text + title: Image-Text to Text - local: tasks/object-detection title: Object Detection - local: tasks/question-answering diff --git a/docs/api-inference/tasks/audio-classification.md b/docs/api-inference/tasks/audio-classification.md index b752e9ee3..f91a18dc3 100644 --- a/docs/api-inference/tasks/audio-classification.md +++ b/docs/api-inference/tasks/audio-classification.md @@ -29,8 +29,9 @@ For more details about the `audio-classification` task, check out its [dedicated ### Recommended models +- [ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition](https://huggingface.co/ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition): An emotion recognition model. -This is only a subset of the supported models. Find the model that suits you best [here](https://huggingface.co/models?inference=warm&pipeline_tag=audio-classification&sort=trending). +Explore all available models and find the one that suits you best [here](https://huggingface.co/models?inference=warm&pipeline_tag=audio-classification&sort=trending). ### Using the API @@ -39,11 +40,10 @@ This is only a subset of the supported models. 
Find the model that suits you bes ```bash -curl https://api-inference.huggingface.co/models/ \ +curl https://api-inference.huggingface.co/models/ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition \ -X POST \ --data-binary '@sample1.flac' \ -H "Authorization: Bearer hf_***" - ``` @@ -51,7 +51,7 @@ curl https://api-inference.huggingface.co/models/ \ ```py import requests -API_URL = "https://api-inference.huggingface.co/models/" +API_URL = "https://api-inference.huggingface.co/models/ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition" headers = {"Authorization": "Bearer hf_***"} def query(filename): @@ -71,7 +71,7 @@ To use the Python client, see `huggingface_hub`'s [package reference](https://hu async function query(filename) { const data = fs.readFileSync(filename); const response = await fetch( - "https://api-inference.huggingface.co/models/", + "https://api-inference.huggingface.co/models/ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition", { headers: { Authorization: "Bearer hf_***" diff --git a/docs/api-inference/tasks/automatic-speech-recognition.md b/docs/api-inference/tasks/automatic-speech-recognition.md index af43524c5..819bbf862 100644 --- a/docs/api-inference/tasks/automatic-speech-recognition.md +++ b/docs/api-inference/tasks/automatic-speech-recognition.md @@ -32,7 +32,7 @@ For more details about the `automatic-speech-recognition` task, check out its [d - [openai/whisper-large-v3](https://huggingface.co/openai/whisper-large-v3): A powerful ASR model by OpenAI. - [pyannote/speaker-diarization-3.1](https://huggingface.co/pyannote/speaker-diarization-3.1): Powerful speaker diarization model. -This is only a subset of the supported models. Find the model that suits you best [here](https://huggingface.co/models?inference=warm&pipeline_tag=automatic-speech-recognition&sort=trending). +Explore all available models and find the one that suits you best [here](https://huggingface.co/models?inference=warm&pipeline_tag=automatic-speech-recognition&sort=trending). ### Using the API @@ -45,7 +45,6 @@ curl https://api-inference.huggingface.co/models/openai/whisper-large-v3 \ -X POST \ --data-binary '@sample1.flac' \ -H "Authorization: Bearer hf_***" - ``` @@ -108,7 +107,7 @@ To use the JavaScript client, see `huggingface.js`'s [package reference](https:/ | **inputs*** | _string_ | The input audio data as a base64-encoded string. If no `parameters` are provided, you can also provide the audio data as a raw bytes payload. | | **parameters** | _object_ | Additional inference parameters for Automatic Speech Recognition | | **        return_timestamps** | _boolean_ | Whether to output corresponding timestamps with the generated text | -| **        generate** | _object_ | Ad-hoc parametrization of the text generation process | +| **        generation_parameters** | _object_ | Ad-hoc parametrization of the text generation process | | **                temperature** | _number_ | The value used to modulate the next token probabilities. | | **                top_k** | _integer_ | The number of highest probability vocabulary tokens to keep for top-k-filtering. | | **                top_p** | _number_ | If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation. 
| diff --git a/docs/api-inference/tasks/chat-completion.md b/docs/api-inference/tasks/chat-completion.md index 367be55db..7acba716b 100644 --- a/docs/api-inference/tasks/chat-completion.md +++ b/docs/api-inference/tasks/chat-completion.md @@ -14,20 +14,23 @@ For more details, check out: ## Chat Completion -Generate a response given a list of messages. -This is a subtask of [`text-generation`](./text_generation) designed to generate responses in a conversational context. - - +Generate a response given a list of messages in a conversational context, supporting both conversational Language Models (LLMs) and conversational Vision-Language Models (VLMs). +This is a subtask of [`text-generation`](https://huggingface.co/docs/api-inference/tasks/text-generation) and [`image-text-to-text`](https://huggingface.co/docs/api-inference/tasks/image-text-to-text). ### Recommended models +#### Conversational Large Language Models (LLMs) + - [google/gemma-2-2b-it](https://huggingface.co/google/gemma-2-2b-it): A text-generation model trained to follow instructions. - [meta-llama/Meta-Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct): Very powerful text generation model trained to follow instructions. - [microsoft/Phi-3-mini-4k-instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct): Small yet powerful text generation model. - [HuggingFaceH4/starchat2-15b-v0.1](https://huggingface.co/HuggingFaceH4/starchat2-15b-v0.1): Strong coding assistant model. - [mistralai/Mistral-Nemo-Instruct-2407](https://huggingface.co/mistralai/Mistral-Nemo-Instruct-2407): Very strong open-source large language model. +#### Conversational Vision-Language Models (VLMs) +- [meta-llama/Llama-3.2-11B-Vision-Instruct](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct): Powerful vision language model with great visual understanding and reasoning capabilities. +- [microsoft/Phi-3.5-vision-instruct](https://huggingface.co/microsoft/Phi-3.5-vision-instruct): Strong image-text-to-text model. ### Using the API @@ -37,6 +40,8 @@ The API supports: * Using grammars, constraints, and tools. * Streaming the output +#### Code snippet example for conversational LLMs + @@ -59,18 +64,15 @@ curl 'https://api-inference.huggingface.co/models/google/gemma-2-2b-it/v1/chat/c ```py from huggingface_hub import InferenceClient -client = InferenceClient( - "google/gemma-2-2b-it", - token="hf_***", -) +client = InferenceClient(api_key="hf_***") for message in client.chat_completion( + model="google/gemma-2-2b-it", messages=[{"role": "user", "content": "What is the capital of France?"}], max_tokens=500, stream=True, ): print(message.choices[0].delta.content, end="") - ``` To use the Python client, see `huggingface_hub`'s [package reference](https://huggingface.co/docs/huggingface_hub/package_reference/inference_client#huggingface_hub.InferenceClient.chat_completion). @@ -89,7 +91,93 @@ for await (const chunk of inference.chatCompletionStream({ })) { process.stdout.write(chunk.choices[0]?.delta?.content || ""); } +``` + +To use the JavaScript client, see `huggingface.js`'s [package reference](https://huggingface.co/docs/huggingface.js/inference/classes/HfInference#chatcompletion). 
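
The cURL example above posts an OpenAI-style payload to the `/v1/chat/completions` route, so the same endpoint can usually be reached with the `openai` Python package as well. This is a hedged sketch rather than an official snippet: the compatibility and the exact `base_url` are assumptions.

```py
from openai import OpenAI

# Assumption: the serverless chat-completion route is OpenAI-compatible, so the
# client only needs the model-specific base URL and a Hugging Face token.
client = OpenAI(
    base_url="https://api-inference.huggingface.co/models/google/gemma-2-2b-it/v1/",
    api_key="hf_***",
)

completion = client.chat.completions.create(
    model="google/gemma-2-2b-it",
    messages=[{"role": "user", "content": "What is the capital of France?"}],
    max_tokens=500,
)

print(completion.choices[0].message.content)
```
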
+ + + + + + +#### Code snippet example for conversational VLMs + + + + +```bash +curl 'https://api-inference.huggingface.co/models/meta-llama/Llama-3.2-11B-Vision-Instruct/v1/chat/completions' \ +-H "Authorization: Bearer hf_***" \ +-H 'Content-Type: application/json' \ +-d '{ + "model": "meta-llama/Llama-3.2-11B-Vision-Instruct", + "messages": [ + { + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"}}, + {"type": "text", "text": "Describe this image in one sentence."} + ] + } + ], + "max_tokens": 500, + "stream": false +}' + +``` + + + +```py +from huggingface_hub import InferenceClient + +client = InferenceClient(api_key="hf_***") + +image_url = "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg" + +for message in client.chat_completion( + model="meta-llama/Llama-3.2-11B-Vision-Instruct", + messages=[ + { + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": image_url}}, + {"type": "text", "text": "Describe this image in one sentence."}, + ], + } + ], + max_tokens=500, + stream=True, +): + print(message.choices[0].delta.content, end="") +``` + +To use the Python client, see `huggingface_hub`'s [package reference](https://huggingface.co/docs/huggingface_hub/package_reference/inference_client#huggingface_hub.InferenceClient.chat_completion). + + + +```js +import { HfInference } from "@huggingface/inference"; + +const inference = new HfInference("hf_***"); +const imageUrl = "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"; + +for await (const chunk of inference.chatCompletionStream({ + model: "meta-llama/Llama-3.2-11B-Vision-Instruct", + messages: [ + { + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": imageUrl}}, + {"type": "text", "text": "Describe this image in one sentence."}, + ], + } + ], + max_tokens: 500, +})) { + process.stdout.write(chunk.choices[0]?.delta?.content || ""); +} ``` To use the JavaScript client, see `huggingface.js`'s [package reference](https://huggingface.co/docs/huggingface.js/inference/classes/HfInference#chatcompletion). diff --git a/docs/api-inference/tasks/feature-extraction.md b/docs/api-inference/tasks/feature-extraction.md index 6eb99703f..b9d0fb312 100644 --- a/docs/api-inference/tasks/feature-extraction.md +++ b/docs/api-inference/tasks/feature-extraction.md @@ -31,7 +31,7 @@ For more details about the `feature-extraction` task, check out its [dedicated p - [thenlper/gte-large](https://huggingface.co/thenlper/gte-large): A powerful feature extraction model for natural language processing tasks. -This is only a subset of the supported models. Find the model that suits you best [here](https://huggingface.co/models?inference=warm&pipeline_tag=feature-extraction&sort=trending). +Explore all available models and find the one that suits you best [here](https://huggingface.co/models?inference=warm&pipeline_tag=feature-extraction&sort=trending). 
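
The extracted embeddings are usually consumed downstream, for example to measure how similar two sentences are. Below is a minimal sketch with the Python client, assuming `huggingface_hub` and `numpy` are installed; the mean-pooling fallback is an assumption for models that return one vector per token rather than a single sentence vector.

```py
import numpy as np
from huggingface_hub import InferenceClient

client = InferenceClient(api_key="hf_***")

def embed(text: str) -> np.ndarray:
    # feature_extraction returns a numpy array; mean-pool in case the model
    # returns token-level vectors instead of a single sentence embedding.
    vectors = np.asarray(client.feature_extraction(text, model="thenlper/gte-large"))
    return vectors if vectors.ndim == 1 else vectors.mean(axis=0)

a = embed("Today is a sunny day and I will get some ice cream.")
b = embed("Ice cream is a nice treat on a warm, sunny day.")

# Cosine similarity of the two sentence embeddings.
similarity = float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))
print(f"Cosine similarity: {similarity:.3f}")
```
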
### Using the API @@ -45,7 +45,6 @@ curl https://api-inference.huggingface.co/models/thenlper/gte-large \ -d '{"inputs": "Today is a sunny day and I will get some ice cream."}' \ -H 'Content-Type: application/json' \ -H "Authorization: Bearer hf_***" - ``` diff --git a/docs/api-inference/tasks/fill-mask.md b/docs/api-inference/tasks/fill-mask.md index d25591df6..b4c07c07a 100644 --- a/docs/api-inference/tasks/fill-mask.md +++ b/docs/api-inference/tasks/fill-mask.md @@ -27,7 +27,7 @@ For more details about the `fill-mask` task, check out its [dedicated page](http - [google-bert/bert-base-uncased](https://huggingface.co/google-bert/bert-base-uncased): The famous BERT model. - [FacebookAI/xlm-roberta-base](https://huggingface.co/FacebookAI/xlm-roberta-base): A multilingual model trained on 100 languages. -This is only a subset of the supported models. Find the model that suits you best [here](https://huggingface.co/models?inference=warm&pipeline_tag=fill-mask&sort=trending). +Explore all available models and find the one that suits you best [here](https://huggingface.co/models?inference=warm&pipeline_tag=fill-mask&sort=trending). ### Using the API @@ -41,7 +41,6 @@ curl https://api-inference.huggingface.co/models/google-bert/bert-base-uncased \ -d '{"inputs": "The answer to the universe is [MASK]."}' \ -H 'Content-Type: application/json' \ -H "Authorization: Bearer hf_***" - ``` diff --git a/docs/api-inference/tasks/image-classification.md b/docs/api-inference/tasks/image-classification.md index 53f5f734f..ce5ad7192 100644 --- a/docs/api-inference/tasks/image-classification.md +++ b/docs/api-inference/tasks/image-classification.md @@ -25,8 +25,9 @@ For more details about the `image-classification` task, check out its [dedicated ### Recommended models - [google/vit-base-patch16-224](https://huggingface.co/google/vit-base-patch16-224): A strong image classification model. +- [facebook/deit-base-distilled-patch16-224](https://huggingface.co/facebook/deit-base-distilled-patch16-224): A robust image classification model. -This is only a subset of the supported models. Find the model that suits you best [here](https://huggingface.co/models?inference=warm&pipeline_tag=image-classification&sort=trending). +Explore all available models and find the one that suits you best [here](https://huggingface.co/models?inference=warm&pipeline_tag=image-classification&sort=trending). ### Using the API @@ -39,7 +40,6 @@ curl https://api-inference.huggingface.co/models/google/vit-base-patch16-224 \ -X POST \ --data-binary '@cats.jpg' \ -H "Authorization: Bearer hf_***" - ``` diff --git a/docs/api-inference/tasks/image-segmentation.md b/docs/api-inference/tasks/image-segmentation.md index 367e4b397..90017e6f6 100644 --- a/docs/api-inference/tasks/image-segmentation.md +++ b/docs/api-inference/tasks/image-segmentation.md @@ -26,7 +26,7 @@ For more details about the `image-segmentation` task, check out its [dedicated p - [nvidia/segformer-b0-finetuned-ade-512-512](https://huggingface.co/nvidia/segformer-b0-finetuned-ade-512-512): Semantic segmentation model trained on ADE20k benchmark dataset with 512x512 resolution. -This is only a subset of the supported models. Find the model that suits you best [here](https://huggingface.co/models?inference=warm&pipeline_tag=image-segmentation&sort=trending). +Explore all available models and find the one that suits you best [here](https://huggingface.co/models?inference=warm&pipeline_tag=image-segmentation&sort=trending). 
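
If you prefer the higher-level Python client over a raw HTTP call, the sketch below shows one way to inspect the returned segments. It assumes `huggingface_hub` and `Pillow` are installed, that `cats.jpg` is a local file, and that the client decodes each mask into a PIL image.

```py
from huggingface_hub import InferenceClient

client = InferenceClient(api_key="hf_***")

# "cats.jpg" is a placeholder path; any local image file (or URL) works here.
segments = client.image_segmentation(
    "cats.jpg",
    model="nvidia/segformer-b0-finetuned-ade-512-512",
)

for i, segment in enumerate(segments):
    print(f"{segment.label} (score={segment.score})")
    # Each mask is decoded by the client into a PIL image and can be saved as-is.
    segment.mask.save(f"mask_{i}_{segment.label}.png")
```
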
### Using the API @@ -39,7 +39,6 @@ curl https://api-inference.huggingface.co/models/nvidia/segformer-b0-finetuned-a -X POST \ --data-binary '@cats.jpg' \ -H "Authorization: Bearer hf_***" - ``` diff --git a/docs/api-inference/tasks/image-text-to-text.md b/docs/api-inference/tasks/image-text-to-text.md new file mode 100644 index 000000000..bacc08dac --- /dev/null +++ b/docs/api-inference/tasks/image-text-to-text.md @@ -0,0 +1,115 @@ + + +## Image-Text to Text + +Image-text-to-text models take in an image and text prompt and output text. These models are also called vision-language models, or VLMs. The difference from image-to-text models is that these models take an additional text input, not restricting the model to certain use cases like image captioning, and may also be trained to accept a conversation as input. + + + +For more details about the `image-text-to-text` task, check out its [dedicated page](https://huggingface.co/tasks/image-text-to-text)! You will find examples and related materials. + + + +### Recommended models + +- [meta-llama/Llama-3.2-11B-Vision-Instruct](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct): Powerful vision language model with great visual understanding and reasoning capabilities. +- [HuggingFaceM4/idefics2-8b-chatty](https://huggingface.co/HuggingFaceM4/idefics2-8b-chatty): Cutting-edge conversational vision language model that can take multiple image inputs. +- [microsoft/Phi-3.5-vision-instruct](https://huggingface.co/microsoft/Phi-3.5-vision-instruct): Strong image-text-to-text model. + +Explore all available models and find the one that suits you best [here](https://huggingface.co/models?inference=warm&pipeline_tag=image-text-to-text&sort=trending). + +### Using the API + + + + + +```bash +curl https://api-inference.huggingface.co/models/meta-llama/Llama-3.2-11B-Vision-Instruct \ + -X POST \ + -d '{"inputs": No input example has been defined for this model task.}' \ + -H 'Content-Type: application/json' \ + -H "Authorization: Bearer hf_***" +``` + + + +```py +import requests + +API_URL = "https://api-inference.huggingface.co/models/meta-llama/Llama-3.2-11B-Vision-Instruct" +headers = {"Authorization": "Bearer hf_***"} + +from huggingface_hub import InferenceClient + +client = InferenceClient(api_key="hf_***") + +image_url = "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg" + +for message in client.chat_completion( + model="meta-llama/Llama-3.2-11B-Vision-Instruct", + messages=[ + { + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": image_url}}, + {"type": "text", "text": "Describe this image in one sentence."}, + ], + } + ], + max_tokens=500, + stream=True, +): + print(message.choices[0].delta.content, end="") +``` + +To use the Python client, see `huggingface_hub`'s [package reference](https://huggingface.co/docs/huggingface_hub/package_reference/inference_client#huggingface_hub.InferenceClient.image_text-to-text). 
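
The snippet above points the model at a remote image URL. For an image stored locally, one common approach is to inline it as a base64 data URL; the sketch below assumes a local `cat.png` file, and whether inline data URLs are accepted depends on the serving backend.

```py
import base64
from huggingface_hub import InferenceClient

client = InferenceClient(api_key="hf_***")

# "cat.png" is a placeholder for any local image file.
with open("cat.png", "rb") as f:
    image_b64 = base64.b64encode(f.read()).decode("utf-8")

response = client.chat_completion(
    model="meta-llama/Llama-3.2-11B-Vision-Instruct",
    messages=[
        {
            "role": "user",
            "content": [
                # Inline the image as a data URL instead of a remote link.
                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_b64}"}},
                {"type": "text", "text": "Describe this image in one sentence."},
            ],
        }
    ],
    max_tokens=500,
)

print(response.choices[0].message.content)
```
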
+ + + +```js +async function query(data) { + const response = await fetch( + "https://api-inference.huggingface.co/models/meta-llama/Llama-3.2-11B-Vision-Instruct", + { + headers: { + Authorization: "Bearer hf_***" + "Content-Type": "application/json", + }, + method: "POST", + body: JSON.stringify(data), + } + ); + const result = await response.json(); + return result; +} + +query({"inputs": No input example has been defined for this model task.}).then((response) => { + console.log(JSON.stringify(response)); +}); +``` + +To use the JavaScript client, see `huggingface.js`'s [package reference](https://huggingface.co/docs/huggingface.js/inference/classes/HfInference#imagetext-to-text). + + + + + + +### API specification + +For the API specification of conversational image-text-to-text models, please refer to the [Chat Completion API documentation](https://huggingface.co/docs/api-inference/tasks/chat-completion#api-specification). + + diff --git a/docs/api-inference/tasks/image-to-image.md b/docs/api-inference/tasks/image-to-image.md index 7b5cfaad4..1c2f277b3 100644 --- a/docs/api-inference/tasks/image-to-image.md +++ b/docs/api-inference/tasks/image-to-image.md @@ -31,7 +31,7 @@ For more details about the `image-to-image` task, check out its [dedicated page] - [timbrooks/instruct-pix2pix](https://huggingface.co/timbrooks/instruct-pix2pix): A model that takes an image and an instruction to edit the image. -This is only a subset of the supported models. Find the model that suits you best [here](https://huggingface.co/models?inference=warm&pipeline_tag=image-to-image&sort=trending). +Explore all available models and find the one that suits you best [here](https://huggingface.co/models?inference=warm&pipeline_tag=image-to-image&sort=trending). ### Using the API diff --git a/docs/api-inference/tasks/object-detection.md b/docs/api-inference/tasks/object-detection.md index fc8d989c1..b01776f25 100644 --- a/docs/api-inference/tasks/object-detection.md +++ b/docs/api-inference/tasks/object-detection.md @@ -27,7 +27,7 @@ For more details about the `object-detection` task, check out its [dedicated pag - [facebook/detr-resnet-50](https://huggingface.co/facebook/detr-resnet-50): Solid object detection model trained on the benchmark dataset COCO 2017. - [microsoft/beit-base-patch16-224-pt22k-ft22k](https://huggingface.co/microsoft/beit-base-patch16-224-pt22k-ft22k): Strong object detection model trained on ImageNet-21k dataset. -This is only a subset of the supported models. Find the model that suits you best [here](https://huggingface.co/models?inference=warm&pipeline_tag=object-detection&sort=trending). +Explore all available models and find the one that suits you best [here](https://huggingface.co/models?inference=warm&pipeline_tag=object-detection&sort=trending). ### Using the API @@ -40,7 +40,6 @@ curl https://api-inference.huggingface.co/models/facebook/detr-resnet-50 \ -X POST \ --data-binary '@cats.jpg' \ -H "Authorization: Bearer hf_***" - ``` diff --git a/docs/api-inference/tasks/question-answering.md b/docs/api-inference/tasks/question-answering.md index 73ccfa13b..1a2beb24f 100644 --- a/docs/api-inference/tasks/question-answering.md +++ b/docs/api-inference/tasks/question-answering.md @@ -27,7 +27,7 @@ For more details about the `question-answering` task, check out its [dedicated p - [deepset/roberta-base-squad2](https://huggingface.co/deepset/roberta-base-squad2): A robust baseline model for most question answering domains. 
- [distilbert/distilbert-base-cased-distilled-squad](https://huggingface.co/distilbert/distilbert-base-cased-distilled-squad): Small yet robust model that can answer questions. -This is only a subset of the supported models. Find the model that suits you best [here](https://huggingface.co/models?inference=warm&pipeline_tag=question-answering&sort=trending). +Explore all available models and find the one that suits you best [here](https://huggingface.co/models?inference=warm&pipeline_tag=question-answering&sort=trending). ### Using the API @@ -41,7 +41,6 @@ curl https://api-inference.huggingface.co/models/deepset/roberta-base-squad2 \ -d '{"inputs": { "question": "What is my name?", "context": "My name is Clara and I live in Berkeley." }}' \ -H 'Content-Type: application/json' \ -H "Authorization: Bearer hf_***" - ``` diff --git a/docs/api-inference/tasks/summarization.md b/docs/api-inference/tasks/summarization.md index c10a1828b..6a46c6d6a 100644 --- a/docs/api-inference/tasks/summarization.md +++ b/docs/api-inference/tasks/summarization.md @@ -26,7 +26,7 @@ For more details about the `summarization` task, check out its [dedicated page]( - [facebook/bart-large-cnn](https://huggingface.co/facebook/bart-large-cnn): A strong summarization model trained on English news articles. Excels at generating factual summaries. -This is only a subset of the supported models. Find the model that suits you best [here](https://huggingface.co/models?inference=warm&pipeline_tag=summarization&sort=trending). +Explore all available models and find the one that suits you best [here](https://huggingface.co/models?inference=warm&pipeline_tag=summarization&sort=trending). ### Using the API @@ -40,7 +40,6 @@ curl https://api-inference.huggingface.co/models/facebook/bart-large-cnn \ -d '{"inputs": "The tower is 324 metres (1,063 ft) tall, about the same height as an 81-storey building, and the tallest structure in Paris. Its base is square, measuring 125 metres (410 ft) on each side. During its construction, the Eiffel Tower surpassed the Washington Monument to become the tallest man-made structure in the world, a title it held for 41 years until the Chrysler Building in New York City was finished in 1930. It was the first structure to reach a height of 300 metres. Due to the addition of a broadcasting aerial at the top of the tower in 1957, it is now taller than the Chrysler Building by 5.2 metres (17 ft). Excluding transmitters, the Eiffel Tower is the second tallest free-standing structure in France after the Millau Viaduct."}' \ -H 'Content-Type: application/json' \ -H "Authorization: Bearer hf_***" - ``` diff --git a/docs/api-inference/tasks/table-question-answering.md b/docs/api-inference/tasks/table-question-answering.md index 3eb659892..75387053c 100644 --- a/docs/api-inference/tasks/table-question-answering.md +++ b/docs/api-inference/tasks/table-question-answering.md @@ -25,7 +25,7 @@ For more details about the `table-question-answering` task, check out its [dedic ### Recommended models -This is only a subset of the supported models. Find the model that suits you best [here](https://huggingface.co/models?inference=warm&pipeline_tag=table-question-answering&sort=trending). +Explore all available models and find the one that suits you best [here](https://huggingface.co/models?inference=warm&pipeline_tag=table-question-answering&sort=trending). 
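
No recommended model is pinned for this task, which is why the cURL example below leaves the model name empty. The sketch here mirrors the same request from Python and fills in `google/tapas-base-finetuned-wtq` purely as an illustration; check the link above for models that are actually available on the API.

```py
import requests

# "google/tapas-base-finetuned-wtq" is only an illustrative model name;
# substitute any warm table-question-answering model from the Hub.
API_URL = "https://api-inference.huggingface.co/models/google/tapas-base-finetuned-wtq"
headers = {"Authorization": "Bearer hf_***"}

payload = {
    "inputs": {
        "query": "How many stars does the transformers repository have?",
        "table": {
            "Repository": ["Transformers", "Datasets", "Tokenizers"],
            "Stars": ["36542", "4512", "3934"],
        },
    }
}

response = requests.post(API_URL, headers=headers, json=payload)
# The response typically contains the answer plus the matching table cells.
print(response.json())
```
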
### Using the API @@ -39,7 +39,6 @@ curl https://api-inference.huggingface.co/models/ \ -d '{"inputs": { "query": "How many stars does the transformers repository have?", "table": { "Repository": ["Transformers", "Datasets", "Tokenizers"], "Stars": ["36542", "4512", "3934"], "Contributors": ["651", "77", "34"], "Programming language": [ "Python", "Python", "Rust, Python and NodeJS" ] } }}' \ -H 'Content-Type: application/json' \ -H "Authorization: Bearer hf_***" - ``` diff --git a/docs/api-inference/tasks/text-classification.md b/docs/api-inference/tasks/text-classification.md index d014a40f2..96640c5b0 100644 --- a/docs/api-inference/tasks/text-classification.md +++ b/docs/api-inference/tasks/text-classification.md @@ -28,8 +28,9 @@ For more details about the `text-classification` task, check out its [dedicated - [ProsusAI/finbert](https://huggingface.co/ProsusAI/finbert): A sentiment analysis model specialized in financial sentiment. - [cardiffnlp/twitter-roberta-base-sentiment-latest](https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment-latest): A sentiment analysis model specialized in analyzing tweets. - [papluca/xlm-roberta-base-language-detection](https://huggingface.co/papluca/xlm-roberta-base-language-detection): A model that can classify languages. +- [meta-llama/Prompt-Guard-86M](https://huggingface.co/meta-llama/Prompt-Guard-86M): A model that can classify text generation attacks. -This is only a subset of the supported models. Find the model that suits you best [here](https://huggingface.co/models?inference=warm&pipeline_tag=text-classification&sort=trending). +Explore all available models and find the one that suits you best [here](https://huggingface.co/models?inference=warm&pipeline_tag=text-classification&sort=trending). ### Using the API @@ -43,7 +44,6 @@ curl https://api-inference.huggingface.co/models/distilbert/distilbert-base-unca -d '{"inputs": "I like you. I love you"}' \ -H 'Content-Type: application/json' \ -H "Authorization: Bearer hf_***" - ``` diff --git a/docs/api-inference/tasks/text-generation.md b/docs/api-inference/tasks/text-generation.md index e69e96dc0..7e315ddc4 100644 --- a/docs/api-inference/tasks/text-generation.md +++ b/docs/api-inference/tasks/text-generation.md @@ -33,7 +33,7 @@ For more details about the `text-generation` task, check out its [dedicated page - [HuggingFaceH4/starchat2-15b-v0.1](https://huggingface.co/HuggingFaceH4/starchat2-15b-v0.1): Strong coding assistant model. - [mistralai/Mistral-Nemo-Instruct-2407](https://huggingface.co/mistralai/Mistral-Nemo-Instruct-2407): Very strong open-source large language model. -This is only a subset of the supported models. Find the model that suits you best [here](https://huggingface.co/models?inference=warm&pipeline_tag=text-generation&sort=trending). +Explore all available models and find the one that suits you best [here](https://huggingface.co/models?inference=warm&pipeline_tag=text-generation&sort=trending). 
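
Beyond the raw HTTP call shown below, the Python client exposes the common generation parameters as keyword arguments. A minimal, non-streaming sketch, assuming `huggingface_hub` is installed; the parameter values are arbitrary examples.

```py
from huggingface_hub import InferenceClient

client = InferenceClient(api_key="hf_***")

# Non-streaming call; the generation parameters below are arbitrary examples.
output = client.text_generation(
    "Can you please let us know more details about your ",
    model="google/gemma-2-2b-it",
    max_new_tokens=150,
    temperature=0.7,
    top_p=0.95,
)

print(output)
```
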
### Using the API @@ -47,7 +47,6 @@ curl https://api-inference.huggingface.co/models/google/gemma-2-2b-it \ -d '{"inputs": "Can you please let us know more details about your "}' \ -H 'Content-Type: application/json' \ -H "Authorization: Bearer hf_***" - ``` diff --git a/docs/api-inference/tasks/text-to-image.md b/docs/api-inference/tasks/text-to-image.md index ec719cba0..df2bb4d2c 100644 --- a/docs/api-inference/tasks/text-to-image.md +++ b/docs/api-inference/tasks/text-to-image.md @@ -25,10 +25,9 @@ For more details about the `text-to-image` task, check out its [dedicated page]( ### Recommended models - [black-forest-labs/FLUX.1-dev](https://huggingface.co/black-forest-labs/FLUX.1-dev): One of the most powerful image generation models that can generate realistic outputs. -- [latent-consistency/lcm-lora-sdxl](https://huggingface.co/latent-consistency/lcm-lora-sdxl): A powerful yet fast image generation model. - [stabilityai/stable-diffusion-3-medium-diffusers](https://huggingface.co/stabilityai/stable-diffusion-3-medium-diffusers): A powerful text-to-image model. -This is only a subset of the supported models. Find the model that suits you best [here](https://huggingface.co/models?inference=warm&pipeline_tag=text-to-image&sort=trending). +Explore all available models and find the one that suits you best [here](https://huggingface.co/models?inference=warm&pipeline_tag=text-to-image&sort=trending). ### Using the API @@ -42,7 +41,6 @@ curl https://api-inference.huggingface.co/models/black-forest-labs/FLUX.1-dev \ -d '{"inputs": "Astronaut riding a horse"}' \ -H 'Content-Type: application/json' \ -H "Authorization: Bearer hf_***" - ``` diff --git a/docs/api-inference/tasks/token-classification.md b/docs/api-inference/tasks/token-classification.md index 035582250..d1055343a 100644 --- a/docs/api-inference/tasks/token-classification.md +++ b/docs/api-inference/tasks/token-classification.md @@ -29,7 +29,7 @@ For more details about the `token-classification` task, check out its [dedicated - [blaze999/Medical-NER](https://huggingface.co/blaze999/Medical-NER): A token classification model specialized on medical entity recognition. - [flair/ner-english](https://huggingface.co/flair/ner-english): Flair models are typically the state of the art in named entity recognition tasks. -This is only a subset of the supported models. Find the model that suits you best [here](https://huggingface.co/models?inference=warm&pipeline_tag=token-classification&sort=trending). +Explore all available models and find the one that suits you best [here](https://huggingface.co/models?inference=warm&pipeline_tag=token-classification&sort=trending). ### Using the API @@ -43,7 +43,6 @@ curl https://api-inference.huggingface.co/models/dslim/bert-base-NER \ -d '{"inputs": "My name is Sarah Jessica Parker but you can call me Jessica"}' \ -H 'Content-Type: application/json' \ -H "Authorization: Bearer hf_***" - ``` diff --git a/docs/api-inference/tasks/translation.md b/docs/api-inference/tasks/translation.md index 908aa972e..18263b71e 100644 --- a/docs/api-inference/tasks/translation.md +++ b/docs/api-inference/tasks/translation.md @@ -26,7 +26,7 @@ For more details about the `translation` task, check out its [dedicated page](ht - [google-t5/t5-base](https://huggingface.co/google-t5/t5-base): A general-purpose Transformer that can be used to translate from English to German, French, or Romanian. -This is only a subset of the supported models. 
Find the model that suits you best [here](https://huggingface.co/models?inference=warm&pipeline_tag=translation&sort=trending). +Explore all available models and find the one that suits you best [here](https://huggingface.co/models?inference=warm&pipeline_tag=translation&sort=trending). ### Using the API @@ -40,7 +40,6 @@ curl https://api-inference.huggingface.co/models/google-t5/t5-base \ -d '{"inputs": "Меня зовут Вольфганг и я живу в Берлине"}' \ -H 'Content-Type: application/json' \ -H "Authorization: Bearer hf_***" - ``` diff --git a/docs/api-inference/tasks/zero-shot-classification.md b/docs/api-inference/tasks/zero-shot-classification.md index 7ccf024aa..8401bcf93 100644 --- a/docs/api-inference/tasks/zero-shot-classification.md +++ b/docs/api-inference/tasks/zero-shot-classification.md @@ -25,9 +25,8 @@ For more details about the `zero-shot-classification` task, check out its [dedic ### Recommended models - [facebook/bart-large-mnli](https://huggingface.co/facebook/bart-large-mnli): Powerful zero-shot text classification model. -- [MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7](https://huggingface.co/MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7): Powerful zero-shot multilingual text classification model that can accomplish multiple tasks. -This is only a subset of the supported models. Find the model that suits you best [here](https://huggingface.co/models?inference=warm&pipeline_tag=zero-shot-classification&sort=trending). +Explore all available models and find the one that suits you best [here](https://huggingface.co/models?inference=warm&pipeline_tag=zero-shot-classification&sort=trending). ### Using the API @@ -41,7 +40,6 @@ curl https://api-inference.huggingface.co/models/facebook/bart-large-mnli \ -d '{"inputs": "Hi, I recently bought a device from your company but it is not working as advertised and I would like to get reimbursed!", "parameters": {"candidate_labels": ["refund", "legal", "faq"]}}' \ -H 'Content-Type: application/json' \ -H "Authorization: Bearer hf_***" - ``` diff --git a/scripts/api-inference/package.json b/scripts/api-inference/package.json index 13f84e881..c8e317ca5 100644 --- a/scripts/api-inference/package.json +++ b/scripts/api-inference/package.json @@ -13,7 +13,7 @@ "author": "", "license": "ISC", "dependencies": { - "@huggingface/tasks": "^0.11.11", + "@huggingface/tasks": "^0.12.15", "@types/node": "^22.5.0", "handlebars": "^4.7.8", "node": "^20.17.0", diff --git a/scripts/api-inference/scripts/generate.ts b/scripts/api-inference/scripts/generate.ts index 49657e1ed..51997f008 100644 --- a/scripts/api-inference/scripts/generate.ts +++ b/scripts/api-inference/scripts/generate.ts @@ -1,4 +1,4 @@ -import { snippets, PipelineType } from "@huggingface/tasks"; +import { PipelineType, snippets } from "@huggingface/tasks"; import Handlebars from "handlebars"; import * as fs from "node:fs/promises"; import * as path from "node:path/posix"; @@ -12,6 +12,7 @@ const TASKS: PipelineType[] = [ "image-classification", "image-segmentation", "image-to-image", + "image-text-to-text", "object-detection", "question-answering", "summarization", @@ -121,8 +122,11 @@ export function getInferenceSnippet( mask_token: "[MASK]", library_name: "", config: {}, + tags: [], }; + // @ts-ignore if (HAS_SNIPPET_FN[language](modelData)) { + // @ts-ignore return GET_SNIPPET_FN[language](modelData, "hf_***"); } } @@ -314,9 +318,8 @@ For more details about the \`{{task}}\` task, check out its [dedicated page](htt `); const TIP_LIST_MODELS_LINK_TEMPLATE = 
Handlebars.compile( - `This is only a subset of the supported models. Find the model that suits you best [here](https://huggingface.co/models?inference=warm&pipeline_tag={{task}}&sort=trending).`, + `Explore all available models and find the one that suits you best [here](https://huggingface.co/models?inference=warm&pipeline_tag={{task}}&sort=trending).`, ); - const SPECS_HEADERS = await readTemplate("specs-headers", "common"); const PAGE_HEADER = Handlebars.compile( await readTemplate("page-header", "common"), @@ -376,7 +379,7 @@ await Promise.all( }) => { console.log(` ⚡ Checking inference status ${model.id}`); let url = `https://huggingface.co/api/models/${model.id}?expand[]=inference`; - if (task === "text-generation") { + if (task === "text-generation" || task === "image-text-to-text") { url += "&expand[]=config"; } const modelData = await fetch(url).then((res) => res.json()); @@ -414,6 +417,7 @@ TASKS.forEach((task) => { }); }); + // Render specs await Promise.all( TASKS_EXTENDED.map(async (task) => { @@ -446,35 +450,54 @@ TASKS.forEach((task) => { /////////////////////////////////////////////// function fetchChatCompletion() { - // Recommended models based on text-generation - DATA.models["chat-completion"] = DATA.models["text-generation"].filter( - // @ts-ignore - (model) => model.config?.tokenizer_config?.chat_template, - ); + const baseName = "chat-completion"; + const conversationalTasks = [ + { + name: "chat-completion", + baseName: "text-generation", + pipelineTag: "text-generation" + }, + { + name: "conversational-image-text-to-text", + baseName: "image-text-to-text", + pipelineTag: "image-text-to-text" + } + ]; + + conversationalTasks.forEach(task => { + // Recommended models based on the base task + DATA.models[task.name] = DATA.models[task.baseName].filter( + // @ts-ignore + (model) => model.config?.tokenizer_config?.chat_template, + ); + + const mainModel = DATA.models[task.name][0]; + const mainModelData = { + // @ts-ignore + id: mainModel.id, + pipeline_tag: task.pipelineTag, + mask_token: "", + library_name: "", + // @ts-ignore + tags: ["conversational"], + // @ts-ignore + config: mainModel.config, + }; + + const taskSnippets = { + // @ts-ignore + curl: GET_SNIPPET_FN["curl"](mainModelData, "hf_***"), + // @ts-ignore + python: GET_SNIPPET_FN["python"](mainModelData, "hf_***"), + // @ts-ignore + javascript: GET_SNIPPET_FN["js"](mainModelData, "hf_***"), + }; + DATA.snippets[task.name] = SNIPPETS_TEMPLATE({ + taskSnippets, + taskSnakeCase: baseName.replace("-", "_"), + taskAttached: baseName.replace("-", ""), + }); - // Snippet specific to chat completion - const mainModel = DATA.models["chat-completion"][0]; - const mainModelData = { - // @ts-ignore - id: mainModel.id, - pipeline_tag: "text-generation", - mask_token: "", - library_name: "", - // @ts-ignore - config: mainModel.config, - }; - const taskSnippets = { - // @ts-ignore - curl: GET_SNIPPET_FN["curl"](mainModelData, "hf_***"), - // @ts-ignore - python: GET_SNIPPET_FN["python"](mainModelData, "hf_***"), - // @ts-ignore - javascript: GET_SNIPPET_FN["js"](mainModelData, "hf_***"), - }; - DATA.snippets["chat-completion"] = SNIPPETS_TEMPLATE({ - taskSnippets, - taskSnakeCase: "chat-completion".replace("-", "_"), - taskAttached: "chat-completion".replace("-", ""), }); } diff --git a/scripts/api-inference/templates/task/chat-completion.handlebars b/scripts/api-inference/templates/task/chat-completion.handlebars index 31acb2d21..ddeff3e4d 100644 --- a/scripts/api-inference/templates/task/chat-completion.handlebars +++ 
b/scripts/api-inference/templates/task/chat-completion.handlebars @@ -1,17 +1,21 @@ ## Chat Completion -Generate a response given a list of messages. -This is a subtask of [`text-generation`](./text_generation) designed to generate responses in a conversational context. - -{{{tips.linksToTaskPage.chat-completion}}} +Generate a response given a list of messages in a conversational context, supporting both conversational Language Models (LLMs) and conversational Vision-Language Models (VLMs). +This is a subtask of [`text-generation`](https://huggingface.co/docs/api-inference/tasks/text-generation) and [`image-text-to-text`](https://huggingface.co/docs/api-inference/tasks/image-text-to-text). ### Recommended models +#### Conversational Large Language Models (LLMs) + {{#each models.chat-completion}} - [{{this.id}}](https://huggingface.co/{{this.id}}): {{this.description}} {{/each}} -{{{tips.listModelsLink.chat-completion}}} +#### Conversational Vision-Language Models (VLMs) + +{{#each models.conversational-image-text-to-text}} +- [{{this.id}}](https://huggingface.co/{{this.id}}): {{this.description}} +{{/each}} ### Using the API @@ -21,8 +25,14 @@ The API supports: * Using grammars, constraints, and tools. * Streaming the output +#### Code snippet example for conversational LLMs + {{{snippets.chat-completion}}} +#### Code snippet example for conversational VLMs + +{{{snippets.conversational-image-text-to-text}}} + ### API specification #### Request diff --git a/scripts/api-inference/templates/task/image-text-to-text.handlebars b/scripts/api-inference/templates/task/image-text-to-text.handlebars new file mode 100644 index 000000000..8aa03f37e --- /dev/null +++ b/scripts/api-inference/templates/task/image-text-to-text.handlebars @@ -0,0 +1,23 @@ +## Image-Text to Text + +Image-text-to-text models take in an image and text prompt and output text. These models are also called vision-language models, or VLMs. The difference from image-to-text models is that these models take an additional text input, not restricting the model to certain use cases like image captioning, and may also be trained to accept a conversation as input. + +{{{tips.linksToTaskPage.image-text-to-text}}} + +### Recommended models + +{{#each models.image-text-to-text}} +- [{{this.id}}](https://huggingface.co/{{this.id}}): {{this.description}} +{{/each}} + +{{{tips.listModelsLink.image-text-to-text}}} + +### Using the API + +{{{snippets.image-text-to-text}}} + +### API specification + +For the API specification of conversational image-text-to-text models, please refer to the [Chat Completion API documentation](https://huggingface.co/docs/api-inference/tasks/chat-completion#api-specification). + +