@huggingface/tasks - npm Package Compare versions

Comparing version 0.1.0 to 0.1.1

dist/index.d.ts

@@ -626,3 +626,3 @@ /**
 };
-base_model?: string;
+base_model?: string | string[];
 };

package.json

 {
 "name": "@huggingface/tasks",
 "packageManager": "pnpm@8.10.5",
-"version": "0.1.0",
+"version": "0.1.1",
 "description": "List of ML tasks for huggingface.co/tasks",

@@ -75,3 +75,3 @@ import type { ModelData } from "./model-data";
 function get_base_diffusers_model(model: ModelData): string {
-return model.cardData?.base_model ?? "fill-in-base-model";
+return model.cardData?.base_model?.toString() ?? "fill-in-base-model";
 }
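
Why `?.toString()` works here: the companion `index.d.ts` change widens `base_model` to `string | string[]`, and calling `toString()` collapses both shapes to a plain string (arrays join their elements with commas), preserving the function's `string` return type. A minimal sketch, with `ModelData` reduced to the one field this diff touches:

```typescript
// Reduced stand-in for ModelData: only the field relevant to this diff.
interface ModelData {
	cardData?: {
		base_model?: string | string[];
	};
}

function get_base_diffusers_model(model: ModelData): string {
	// String#toString() is the identity; Array#toString() joins with commas,
	// so both shapes collapse to a single string.
	return model.cardData?.base_model?.toString() ?? "fill-in-base-model";
}

console.log(get_base_diffusers_model({ cardData: { base_model: "org/base-model" } }));
// -> "org/base-model"
console.log(get_base_diffusers_model({ cardData: { base_model: ["org/base-a", "org/base-b"] } }));
// -> "org/base-a,org/base-b"
console.log(get_base_diffusers_model({}));
// -> "fill-in-base-model"
```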

@@ -96,3 +96,3 @@ import type { PipelineType } from "./pipelines";
 };
-base_model?: string;
+base_model?: string | string[];
 };

@@ -27,3 +27,2 @@ import type { TaskDataCustom } from "..";
 {
-// TO DO: write description
 description: "Strong Depth Estimation model trained on 1.4 million images.",

@@ -33,6 +32,9 @@ id: "Intel/dpt-large",
 {
-// TO DO: write description
-id: "vinvino02/glpn-kitti",
+description: "Strong Depth Estimation model trained on the KITTI dataset.",
+id: "facebook/dpt-dinov2-large-kitti",
 },
+{
+description: "A strong monocular depth estimation model.",
+id: "Bingxin/Marigold",
+},
 ],

@@ -53,2 +53,6 @@ import type { TaskDataCustom } from "..";
 },
+{
+description: "A powerful model for document question answering.",
+id: "google/pix2struct-docvqa-large",
+},
 ],

@@ -64,2 +68,6 @@ spaces: [
 },
+{
+description: "An application to compare different document question answering models.",
+id: "merve/compare_docvqa_models",
+},
 ],

@@ -30,2 +30,15 @@ ## Use Cases
+### Conversation about the Image
+
+Some text generation models also take image inputs; these are called vision language models. You can run them with the `image-to-text` pipeline, as shown below.
+
+```python
+from transformers import pipeline
+
+mm_pipeline = pipeline("image-to-text", model="llava-hf/llava-1.5-7b-hf")
+mm_pipeline("https://huggingface.co/spaces/llava-hf/llava-4bit/resolve/main/examples/baklava.png", "How to make this pastry?")
+## [{'generated_text': 'To create these pastries, you will need a few key ingredients and tools. Firstly, gather the dough by combining flour with water in your mixing bowl until it forms into an elastic ball that can be easily rolled out on top of another surface or table without breaking apart (like pizza).'}]
+```
+
 ### OCR

@@ -35,27 +35,23 @@ import type { TaskDataCustom } from "..";
 description: "A robust image captioning model.",
-id: "Salesforce/blip-image-captioning-large",
+id: "Salesforce/blip2-opt-2.7b",
 },
 {
-description: "A strong image captioning model.",
-id: "nlpconnect/vit-gpt2-image-captioning",
+description: "A powerful and accurate image-to-text model that can also localize concepts in images.",
+id: "microsoft/kosmos-2-patch14-224",
 },
 {
 description: "A strong optical character recognition model.",
-id: "microsoft/trocr-base-printed",
+id: "facebook/nougat-base",
 },
 {
-description: "A strong visual question answering model for scientific diagrams.",
-id: "google/pix2struct-ai2d-base",
+description: "A powerful model that lets you have a conversation with the image.",
+id: "llava-hf/llava-1.5-7b-hf",
 },
-{
-description: "A strong captioning model for UI components.",
-id: "google/pix2struct-widget-captioning-base",
-},
-{
-description: "A captioning model for images that contain text.",
-id: "google/pix2struct-textcaps-base",
-},
 ],
 spaces: [
 {
+description: "An application that compares various image captioning models.",
+id: "nielsr/comparing-captioning-models",
+},
+{
 description: "A robust image captioning application.",

@@ -54,4 +54,4 @@ import { type PipelineType, PIPELINE_DATA } from "../pipelines";
 "image-segmentation": ["transformers", "transformers.js"],
-"image-to-image": ["diffusers", "transformers.js"],
-"image-to-text": ["transformers.js"],
+"image-to-image": ["diffusers", "transformers", "transformers.js"],
+"image-to-text": ["transformers", "transformers.js"],
 "image-to-video": ["diffusers"],

@@ -43,3 +43,2 @@ import type { TaskDataCustom } from "..";
 {
-// TO DO: write description
 description: "Solid object detection model trained on the benchmark dataset COCO 2017.",

@@ -55,4 +54,8 @@ id: "facebook/detr-resnet-50",
 },
 {
+description: "Leaderboard to compare various object detection models across several metrics.",
+id: "hf-vision/object_detection_leaderboard",
+},
+{
 description: "An object detection application that can detect unseen objects out of the box.",
-id: "adirik/OWL-ViT",
+id: "merve/owlv2",
 },

@@ -34,3 +34,3 @@ ## Use Cases
-print(tqa(table=table, query=query)['cells'][0])
+print(tqa(table=table, query=question)['cells'][0])
 #53
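
The same task can be called from TypeScript through the `@huggingface/inference` client's `tableQuestionAnswering` method. A hedged sketch: the model ID and table contents are illustrative, and `HF_TOKEN` is assumed to be set in the environment:

```typescript
import { HfInference } from "@huggingface/inference";

const hf = new HfInference(process.env.HF_TOKEN);

const result = await hf.tableQuestionAnswering({
	model: "google/tapas-base-finetuned-wtq", // illustrative model choice
	inputs: {
		query: "How many stars does the transformers repository have?",
		table: {
			Repository: ["Transformers", "Datasets", "Tokenizers"],
			Stars: ["36542", "4512", "3934"],
		},
	},
});

// `cells` holds the table cells the model selected as the answer.
console.log(result.cells[0]); // e.g. "36542"
```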

@@ -45,2 +45,6 @@ This task covers guides on both [text-generation](https://huggingface.co/models?pipeline_tag=text-generation&sort=downloads) and [text-to-text generation](https://huggingface.co/models?pipeline_tag=text2text-generation&sort=downloads) models. Popular large language models that are used for chats or following instructions are also covered in this task. You can find the list of selected open-source large language models [here](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard), ranked by their performance scores.
+## Text Generation from Image and Text
+
+There are language models that take both text and an image as input and output text; these are called vision language models. [LLaVA](https://huggingface.co/llava-hf/llava-1.5-7b-hf) and [BLIP-2](https://huggingface.co/Salesforce/blip2-opt-2.7b) are good examples. They accept the same generation parameters as other language models, but because they also take input images, you use them through the `image-to-text` pipeline; see the [image-to-text](https://huggingface.co/tasks/image-to-text) task page for details.
+
 ## Inference
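
For JavaScript callers, the same vision-language flow is reachable through the `@huggingface/inference` client's `imageToText` method. A hedged sketch: whether the hosted Inference API serves this particular model is an assumption, and `HF_TOKEN` must be set:

```typescript
import { HfInference } from "@huggingface/inference";

const hf = new HfInference(process.env.HF_TOKEN);

// Fetch the example image as a Blob; imageToText accepts Blob or ArrayBuffer.
const image = await (
	await fetch("https://huggingface.co/spaces/llava-hf/llava-4bit/resolve/main/examples/baklava.png")
).blob();

const { generated_text } = await hf.imageToText({
	model: "llava-hf/llava-1.5-7b-hf", // vision language model from the diff above
	data: image,
});

console.log(generated_text);
```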

@@ -48,10 +48,8 @@ import type { TaskDataCustom } from "..";
 {
-description:
-"A latent text-to-image diffusion model capable of generating photo-realistic images given any text input.",
-id: "CompVis/stable-diffusion-v1-4",
+description: "One of the most powerful image generation models that can generate realistic outputs.",
+id: "stabilityai/stable-diffusion-xl-base-1.0",
 },
 {
-description:
-"A model that can be used to generate images based on text prompts. The DALL·E Mega model is the largest version of DALLE Mini.",
-id: "dalle-mini/dalle-mega",
+description: "A powerful yet fast image generation model.",
+id: "latent-consistency/lcm-lora-sdxl",
 },

@@ -73,15 +71,19 @@ {
 {
-description: "An text-to-image application that can generate coherent text inside the image.",
+description: "A text-to-image application to generate comics.",
+id: "jbilcke-hf/ai-comic-factory",
+},
+{
+description: "A text-to-image application that can generate coherent text inside the image.",
 id: "DeepFloyd/IF",
 },
 {
-description: "An powerful text-to-image application that can generate images.",
-id: "kakaobrain/karlo",
+description: "A powerful yet very fast image generation application.",
+id: "latent-consistency/lcm-lora-for-sdxl",
 },
 {
-description: "An powerful text-to-image application that can generates 3D representations.",
+description: "A powerful text-to-image application that can generate 3D representations.",
 id: "hysts/Shap-E",
 },
 {
-description: "A strong application for `text-to-image`, `image-to-image` and image inpainting.",
+description: "An application for `text-to-image`, `image-to-image` and image inpainting.",
 id: "ArtGAN/Stable-Diffusion-ControlNet-WebUI",

@@ -71,3 +71,3 @@ import type { TaskDataCustom } from "..";
 description: "A strong model for video generation.",
-id: "PAIR/text2video-zero-controlnet-canny-arcane",
+id: "Vchitect/LaVie",
 },

@@ -80,3 +80,3 @@ {
 description: "A text-to-video generation model with high quality and smooth outputs.",
-id: "cerspense/zeroscope_v2_576w",
+id: "hotshotco/Hotshot-XL",
 },

@@ -91,3 +91,3 @@ ],
 description: "An application that generates video from image and text.",
-id: "TempoFunk/makeavid-sd-jax",
+id: "Vchitect/LaVie",
 },

@@ -75,2 +75,6 @@ import type { TaskDataCustom } from "..";
 {
+description: "An application that compares visual question answering models across different tasks.",
+id: "merve/pix2struct",
+},
+{
 description: "An application that can answer questions based on images.",

(Diffs of two further files are omitted: one is too big to display, one is not supported.)
