"""VoyageAI text vectorizer for RedisVL (redisvl.utils.vectorize.text.voyageai)."""

import os
from typing import Any, Callable, Dict, List, Optional

from pydantic.v1 import PrivateAttr
from tenacity import retry, stop_after_attempt, wait_random_exponential
from tenacity.retry import retry_if_not_exception_type

from redisvl.utils.vectorize.base import BaseVectorizer

# ignore that voyageai isn't imported
# mypy: disable-error-code="name-defined"


class VoyageAITextVectorizer(BaseVectorizer):
    """The VoyageAITextVectorizer class utilizes VoyageAI's API to generate
    embeddings for text data.

    This vectorizer is designed to interact with VoyageAI's /embed API,
    requiring an API key for authentication. The key can be provided directly
    in the `api_config` dictionary or through the `VOYAGE_API_KEY` environment
    variable. User must obtain an API key from VoyageAI's website
    (https://dash.voyageai.com/). Additionally, the `voyageai` python client
    must be installed with `pip install voyageai`.

    The vectorizer supports both synchronous and asynchronous operations,
    allows for batch processing of texts and flexibility in handling
    preprocessing tasks.

    .. code-block:: python

        from redisvl.utils.vectorize import VoyageAITextVectorizer

        vectorizer = VoyageAITextVectorizer(
            model="voyage-large-2",
            api_config={"api_key": "your-voyageai-api-key"} # OR set VOYAGE_API_KEY in your env
        )
        query_embedding = vectorizer.embed(
            text="your input query text here",
            input_type="query"
        )
        doc_embeddings = vectorizer.embed_many(
            texts=["your document text", "more document text"],
            input_type="document"
        )

    """

    # Sync and async voyageai clients, created in _initialize_client().
    _client: Any = PrivateAttr()
    _aclient: Any = PrivateAttr()

    def __init__(
        self,
        model: str = "voyage-large-2",
        api_config: Optional[Dict] = None,
        dtype: str = "float32",
    ):
        """Initialize the VoyageAI vectorizer.

        Visit https://docs.voyageai.com/docs/embeddings to learn about embeddings
        and check the available models.

        Args:
            model (str): Model to use for embedding. Defaults to "voyage-large-2".
            api_config (Optional[Dict], optional): Dictionary containing the API key.
                Defaults to None.
            dtype (str): the default datatype to use when embedding text as byte arrays.
                Used when setting `as_buffer=True` in calls to embed() and embed_many().
                Defaults to 'float32'.

        Raises:
            ImportError: If the voyageai library is not installed.
            ValueError: If the API key is not provided.
        """
        self._initialize_client(api_config)
        # Dims are discovered empirically by embedding a probe text below.
        super().__init__(model=model, dims=self._set_model_dims(model), dtype=dtype)

    def _initialize_client(self, api_config: Optional[Dict]):
        """
        Setup the VoyageAI clients using the provided API key or an
        environment variable.
        """
        # Dynamic import of the voyageai module
        try:
            from voyageai import AsyncClient, Client
        except ImportError:
            raise ImportError(
                "VoyageAI vectorizer requires the voyageai library. \
                Please install with `pip install voyageai`"
            )

        # Fetch the API key from api_config, falling back to the environment
        # variable. (Previously, passing an api_config dict without "api_key"
        # skipped the env-var fallback entirely and raised even when
        # VOYAGE_API_KEY was set.)
        api_key = (api_config or {}).get("api_key") or os.getenv("VOYAGE_API_KEY")

        if not api_key:
            raise ValueError(
                "VoyageAI API key is required. "
                "Provide it in api_config or set the VOYAGE_API_KEY environment variable."
            )

        self._client = Client(api_key=api_key)
        self._aclient = AsyncClient(api_key=api_key)

    def _set_model_dims(self, model) -> int:
        """Return the embedding dimensionality of `model` by embedding a probe
        text and measuring the result.

        Raises:
            ValueError: If the API response is malformed or the call fails.
        """
        try:
            embedding = self._client.embed(
                texts=["dimension test"],
                model=model,
                input_type="document",
            ).embeddings[0]
        except (KeyError, IndexError) as ke:
            # Chain the original exception for easier debugging.
            raise ValueError(
                f"Unexpected response from the VoyageAI API: {str(ke)}"
            ) from ke
        except Exception as e:  # pylint: disable=broad-except
            # fall back (TODO get more specific)
            raise ValueError(
                f"Error setting embedding model dimensions: {str(e)}"
            ) from e
        return len(embedding)
[docs] def embed( self, text: str, preprocess: Optional[Callable] = None, as_buffer: bool = False, **kwargs, ) -> List[float]: """Embed a chunk of text using the VoyageAI Embeddings API. Can provide the embedding `input_type` as a `kwarg` to this method that specifies the type of input you're giving to the model. For retrieval/search use cases, we recommend specifying this argument when encoding queries or documents to enhance retrieval quality. Embeddings generated with and without the input_type argument are compatible. Supported input types are ``document`` and ``query`` When hydrating your Redis DB, the documents you want to search over should be embedded with input_type="document" and when you are querying the database, you should set the input_type="query". Args: text (str): Chunk of text to embed. preprocess (Optional[Callable], optional): Optional preprocessing callable to perform before vectorization. Defaults to None. as_buffer (bool, optional): Whether to convert the raw embedding to a byte string. Defaults to False. input_type (str): Specifies the type of input passed to the model. truncation (bool): Whether to truncate the input texts to fit within the context length. Check https://docs.voyageai.com/docs/embeddings Returns: List[float]: Embedding. Raises: TypeError: If an invalid input_type is provided. """ return self.embed_many( texts=[text], preprocess=preprocess, as_buffer=as_buffer, **kwargs )[0]
[docs] @retry( wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6), retry=retry_if_not_exception_type(TypeError), ) def embed_many( self, texts: List[str], preprocess: Optional[Callable] = None, batch_size: Optional[int] = None, as_buffer: bool = False, **kwargs, ) -> List[List[float]]: """Embed many chunks of text using the VoyageAI Embeddings API. Can provide the embedding `input_type` as a `kwarg` to this method that specifies the type of input you're giving to the model. For retrieval/search use cases, we recommend specifying this argument when encoding queries or documents to enhance retrieval quality. Embeddings generated with and without the input_type argument are compatible. Supported input types are ``document`` and ``query`` When hydrating your Redis DB, the documents you want to search over should be embedded with input_type="document" and when you are querying the database, you should set the input_type="query". Args: texts (List[str]): List of text chunks to embed. preprocess (Optional[Callable], optional): Optional preprocessing callable to perform before vectorization. Defaults to None. batch_size (int, optional): Batch size of texts to use when creating embeddings. . as_buffer (bool, optional): Whether to convert the raw embedding to a byte string. Defaults to False. input_type (str): Specifies the type of input passed to the model. truncation (bool): Whether to truncate the input texts to fit within the context length. Check https://docs.voyageai.com/docs/embeddings Returns: List[List[float]]: List of embeddings. Raises: TypeError: If an invalid input_type is provided. 
""" input_type = kwargs.get("input_type") truncation = kwargs.get("truncation") dtype = kwargs.pop("dtype", self.dtype) if not isinstance(texts, list): raise TypeError("Must pass in a list of str values to embed.") if len(texts) > 0 and not isinstance(texts[0], str): raise TypeError("Must pass in a list of str values to embed.") if input_type is not None and input_type not in ["document", "query"]: raise TypeError( "Must pass in a allowed value for voyageai embedding input_type. \ See https://docs.voyageai.com/docs/embeddings." ) if truncation is not None and not isinstance(truncation, bool): raise TypeError("Truncation (optional) parameter is a bool.") if batch_size is None: batch_size = ( 72 if self.model in ["voyage-2", "voyage-02"] else ( 30 if self.model == "voyage-3-lite" else (10 if self.model == "voyage-3" else 7) ) ) embeddings: List = [] for batch in self.batchify(texts, batch_size, preprocess): response = self._client.embed( texts=batch, model=self.model, input_type=input_type ) embeddings += [ self._process_embedding(embedding, as_buffer, dtype) for embedding in response.embeddings ] return embeddings
[docs] async def aembed_many( self, texts: List[str], preprocess: Optional[Callable] = None, batch_size: Optional[int] = None, as_buffer: bool = False, **kwargs, ) -> List[List[float]]: """Embed many chunks of text using the VoyageAI Embeddings API. Can provide the embedding `input_type` as a `kwarg` to this method that specifies the type of input you're giving to the model. For retrieval/search use cases, we recommend specifying this argument when encoding queries or documents to enhance retrieval quality. Embeddings generated with and without the input_type argument are compatible. Supported input types are ``document`` and ``query`` When hydrating your Redis DB, the documents you want to search over should be embedded with input_type="document" and when you are querying the database, you should set the input_type="query". Args: texts (List[str]): List of text chunks to embed. preprocess (Optional[Callable], optional): Optional preprocessing callable to perform before vectorization. Defaults to None. batch_size (int, optional): Batch size of texts to use when creating embeddings. . as_buffer (bool, optional): Whether to convert the raw embedding to a byte string. Defaults to False. input_type (str): Specifies the type of input passed to the model. truncation (bool): Whether to truncate the input texts to fit within the context length. Check https://docs.voyageai.com/docs/embeddings Returns: List[List[float]]: List of embeddings. Raises: TypeError: In an invalid input_type is provided. """ input_type = kwargs.get("input_type") truncation = kwargs.get("truncation") dtype = kwargs.pop("dtype", self.dtype) if not isinstance(texts, list): raise TypeError("Must pass in a list of str values to embed.") if len(texts) > 0 and not isinstance(texts[0], str): raise TypeError("Must pass in a list of str values to embed.") if input_type is not None and input_type not in ["document", "query"]: raise TypeError( "Must pass in a allowed value for voyageai embedding input_type. 
\ See https://docs.voyageai.com/docs/embeddings." ) if truncation is not None and not isinstance(truncation, bool): raise TypeError("Truncation (optional) parameter is a bool.") if batch_size is None: batch_size = ( 72 if self.model in ["voyage-2", "voyage-02"] else ( 30 if self.model == "voyage-3-lite" else (10 if self.model == "voyage-3" else 7) ) ) embeddings: List = [] for batch in self.batchify(texts, batch_size, preprocess): response = await self._aclient.embed( texts=batch, model=self.model, input_type=input_type ) embeddings += [ self._process_embedding(embedding, as_buffer, dtype) for embedding in response.embeddings ] return embeddings
[docs] async def aembed( self, text: str, preprocess: Optional[Callable] = None, as_buffer: bool = False, **kwargs, ) -> List[float]: """Embed a chunk of text using the VoyageAI Embeddings API. Can provide the embedding `input_type` as a `kwarg` to this method that specifies the type of input you're giving to the model. For retrieval/search use cases, we recommend specifying this argument when encoding queries or documents to enhance retrieval quality. Embeddings generated with and without the input_type argument are compatible. Supported input types are ``document`` and ``query`` When hydrating your Redis DB, the documents you want to search over should be embedded with input_type="document" and when you are querying the database, you should set the input_type="query". Args: text (str): Chunk of text to embed. preprocess (Optional[Callable], optional): Optional preprocessing callable to perform before vectorization. Defaults to None. as_buffer (bool, optional): Whether to convert the raw embedding to a byte string. Defaults to False. input_type (str): Specifies the type of input passed to the model. truncation (bool): Whether to truncate the input texts to fit within the context length. Check https://docs.voyageai.com/docs/embeddings Returns: List[float]: Embedding. Raises: TypeError: In an invalid input_type is provided. """ result = await self.aembed_many( texts=[text], preprocess=preprocess, as_buffer=as_buffer, **kwargs ) return result[0]