# Source code for caikit.interfaces.nlp.tasks

# Copyright The Caikit Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This module holds the Task definitions for all common NLP tasks
"""

# Standard
from typing import Iterable, List

# Local
from ...core import TaskBase, task
from ...core.data_model.json_dict import JsonDict
from .data_model import SentenceSimilarityResult, SentenceSimilarityResults
from .data_model.classification import (
    ClassificationResults,
    ClassifiedGeneratedTextResult,
    ClassifiedGeneratedTextStreamResult,
    TokenClassificationResults,
    TokenClassificationStreamResult,
)
from .data_model.embedding_vectors import EmbeddingResult, EmbeddingResults
from .data_model.reranker import RerankResult, RerankResults
from .data_model.text import (
    ChunkerTokenizationStreamResult,
    TokenizationResults,
    TokenizationStreamResult,
)
from .data_model.text_generation import GeneratedTextResult, GeneratedTextStreamResult


@task(
    unary_parameters={"text": str},
    unary_output_type=GeneratedTextResult,
    streaming_output_type=Iterable[GeneratedTextStreamResult],
)
class TextGenerationTask(TaskBase):
    """Task for generating additional text from an input text prompt.

    Input:
        text: The prompting text.

    Output:
        Unary: ``GeneratedTextResult``
        Streaming: an iterable of ``GeneratedTextStreamResult``
    """
@task(
    required_parameters={"text": str},
    output_type=ClassificationResults,
)
class TextClassificationTask(TaskBase):
    """Task for assigning a label or class to a piece of text.

    Input:
        text: The text to classify.

    Output:
        ``ClassificationResults``
    """
@task(
    unary_parameters={"text": str},
    streaming_parameters={"text_stream": Iterable[str]},
    unary_output_type=TokenClassificationResults,
    streaming_output_type=Iterable[TokenClassificationStreamResult],
)
class TokenClassificationTask(TaskBase):
    """Task for assigning a label to individual tokens in a document.

    Input:
        Unary: ``text`` (a single string)
        Streaming: ``text_stream`` (an iterable of strings)

    Output:
        Unary: ``TokenClassificationResults``
        Streaming: an iterable of ``TokenClassificationStreamResult``
    """
@task(
    unary_parameters={"text": str},
    streaming_parameters={"text_stream": Iterable[str]},
    unary_output_type=TokenizationResults,
    streaming_output_type=Iterable[TokenizationStreamResult],
)
class TokenizationTask(TaskBase):
    """Task for splitting a document into tokens.

    Input:
        Unary: ``text`` (a single string)
        Streaming: ``text_stream`` (an iterable of strings)

    Output:
        Unary: ``TokenizationResults``
        Streaming: an iterable of ``TokenizationStreamResult``
    """
@task(
    unary_parameters={"text": str},
    unary_output_type=ClassifiedGeneratedTextResult,
    streaming_output_type=Iterable[ClassifiedGeneratedTextStreamResult],
)
class ClassificationWithTextGenerationTask(TaskBase):
    """Task for generating text from a prompt and classifying what was generated.

    The input prompting text is used to generate additional text, and the
    generated text is then classified based on detectors.

    Input:
        text: The prompting text.

    Output:
        Unary: ``ClassifiedGeneratedTextResult``
        Streaming: an iterable of ``ClassifiedGeneratedTextStreamResult``
    """
@task(
    required_parameters={"text": str},
    output_type=EmbeddingResult,
)
class EmbeddingTask(TaskBase):
    """Task that returns a text embedding for a single input string.

    Input:
        text: The string to embed.

    Output:
        ``EmbeddingResult``
    """
@task(
    required_parameters={"texts": List[str]},
    output_type=EmbeddingResults,
)
class EmbeddingTasks(TaskBase):
    """Task that returns a text embedding for each string in an input list.

    Input:
        texts: The list of strings to embed.

    Output:
        ``EmbeddingResults``
    """
@task(
    required_parameters={
        "documents": List[JsonDict],
        "query": str,
    },
    output_type=RerankResult,
)
class RerankTask(TaskBase):
    """Rank documents by relevance to a single query.

    Required parameters:
        query: The search query.
        documents: JSON documents to search; each contains "text" or the
            alternative "_text".

    Returns:
        An ordered list of the top_n documents, most relevant first.
        Each entry carries a score and the document index (its position in
        the input). Whether the original document JSON is returned depends
        on optional args. When the optional top_n parameter is used, it
        limits the results.
    """
@task(
    required_parameters={
        "documents": List[JsonDict],
        "queries": List[str],
    },
    output_type=RerankResults,
)
class RerankTasks(TaskBase):
    """Rank documents by relevance to each of several queries.

    Required parameters:
        queries: The search queries.
        documents: JSON documents to search; each contains "text" or the
            alternative "_text".

    Returns:
        One result per query, in the order of the queries. Within each
        query result:
            - The query text is optionally included for visual convenience.
            - The top_n documents are listed most relevant first.
            - Each entry carries a score and the document index (its
              position in the input).
            - Whether the original document JSON is returned depends on
              optional args.
        When the optional top_n parameter is used, it limits the results.
    """
@task(
    required_parameters={"source_sentence": str, "sentences": List[str]},
    output_type=SentenceSimilarityResult,
)
class SentenceSimilarityTask(TaskBase):
    """Task that compares one source sentence against a list of sentences.

    Input:
        source_sentence: The sentence to compare from.
        sentences: The sentences to compare against.

    Output:
        ``SentenceSimilarityResult`` containing a list of scores in the order
        of the input sentences.
    """
@task(
    required_parameters={"source_sentences": List[str], "sentences": List[str]},
    output_type=SentenceSimilarityResults,
)
class SentenceSimilarityTasks(TaskBase):
    """Task that compares several source sentences against a list of sentences.

    Input:
        source_sentences: The sentences to compare from.
        sentences: The sentences to compare against.

    Output:
        ``SentenceSimilarityResults`` holding a list of results in the order
        of the source_sentences. Each result contains a list of scores in the
        order of the input sentences.
    """
@task(
    unary_parameters={"text": str},
    streaming_parameters={
        "text_stream": Iterable[str],
        "input_index_stream": Iterable[int],
    },
    unary_output_type=TokenizationResults,
    streaming_output_type=Iterable[ChunkerTokenizationStreamResult],
)
class ChunkerTokenizationTask(TaskBase):
    """Task for breaking input text into chunks of a pre-configured type.

    Sentences are one example of such a chunk type.

    Input:
        Unary: ``text`` (a single string)
        Streaming: ``text_stream`` (an iterable of strings) together with
            ``input_index_stream`` (an iterable of ints)

    Output:
        Unary: ``TokenizationResults``
        Streaming: an iterable of ``ChunkerTokenizationStreamResult``
    """