# ai-station/.venv/lib/python3.12/site-packages/litellm/constants.py

import os
from typing import List, Literal
ROUTER_MAX_FALLBACKS = int(os.getenv("ROUTER_MAX_FALLBACKS", 5))
DEFAULT_BATCH_SIZE = int(os.getenv("DEFAULT_BATCH_SIZE", 512))
DEFAULT_FLUSH_INTERVAL_SECONDS = int(os.getenv("DEFAULT_FLUSH_INTERVAL_SECONDS", 5))
DEFAULT_S3_FLUSH_INTERVAL_SECONDS = int(
    os.getenv("DEFAULT_S3_FLUSH_INTERVAL_SECONDS", 10)
)
DEFAULT_S3_BATCH_SIZE = int(os.getenv("DEFAULT_S3_BATCH_SIZE", 512))
DEFAULT_SQS_FLUSH_INTERVAL_SECONDS = int(
    os.getenv("DEFAULT_SQS_FLUSH_INTERVAL_SECONDS", 10)
)
DEFAULT_SQS_BATCH_SIZE = int(os.getenv("DEFAULT_SQS_BATCH_SIZE", 512))
SQS_SEND_MESSAGE_ACTION = "SendMessage"
SQS_API_VERSION = "2012-11-05"
DEFAULT_MAX_RETRIES = int(os.getenv("DEFAULT_MAX_RETRIES", 2))
DEFAULT_MAX_RECURSE_DEPTH = int(os.getenv("DEFAULT_MAX_RECURSE_DEPTH", 100))
DEFAULT_MAX_RECURSE_DEPTH_SENSITIVE_DATA_MASKER = int(
    os.getenv("DEFAULT_MAX_RECURSE_DEPTH_SENSITIVE_DATA_MASKER", 10)
)
DEFAULT_FAILURE_THRESHOLD_PERCENT = float(
    os.getenv("DEFAULT_FAILURE_THRESHOLD_PERCENT", 0.5)
)  # by default, cool down a deployment if 50% of its requests fail in a given minute
DEFAULT_MAX_TOKENS = int(os.getenv("DEFAULT_MAX_TOKENS", 4096))
DEFAULT_ALLOWED_FAILS = int(os.getenv("DEFAULT_ALLOWED_FAILS", 3))
DEFAULT_REDIS_SYNC_INTERVAL = int(os.getenv("DEFAULT_REDIS_SYNC_INTERVAL", 1))
DEFAULT_COOLDOWN_TIME_SECONDS = int(os.getenv("DEFAULT_COOLDOWN_TIME_SECONDS", 5))
DEFAULT_REPLICATE_POLLING_RETRIES = int(
    os.getenv("DEFAULT_REPLICATE_POLLING_RETRIES", 5)
)
DEFAULT_REPLICATE_POLLING_DELAY_SECONDS = int(
    os.getenv("DEFAULT_REPLICATE_POLLING_DELAY_SECONDS", 1)
)
DEFAULT_IMAGE_TOKEN_COUNT = int(os.getenv("DEFAULT_IMAGE_TOKEN_COUNT", 250))
DEFAULT_IMAGE_WIDTH = int(os.getenv("DEFAULT_IMAGE_WIDTH", 300))
DEFAULT_IMAGE_HEIGHT = int(os.getenv("DEFAULT_IMAGE_HEIGHT", 300))
MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB = int(
    os.getenv("MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB", 1024)
)  # 1 MB = 1024 KB
SINGLE_DEPLOYMENT_TRAFFIC_FAILURE_THRESHOLD = int(
    os.getenv("SINGLE_DEPLOYMENT_TRAFFIC_FAILURE_THRESHOLD", 1000)
)  # minimum number of requests to consider "reasonable traffic"; used for single-deployment cooldown logic
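# Illustrative sketch (not litellm's actual router code) of how the two
# thresholds above could combine in cooldown logic; `fails`, `total`, and
# `single_deployment` are hypothetical per-minute inputs:
#
#     def _should_cooldown(fails: int, total: int, single_deployment: bool) -> bool:
#         if total == 0:
#             return False
#         if single_deployment and total < SINGLE_DEPLOYMENT_TRAFFIC_FAILURE_THRESHOLD:
#             return False  # too little traffic to judge a lone deployment
#         return (fails / total) >= DEFAULT_FAILURE_THRESHOLD_PERCENT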
DEFAULT_REASONING_EFFORT_DISABLE_THINKING_BUDGET = int(
    os.getenv("DEFAULT_REASONING_EFFORT_DISABLE_THINKING_BUDGET", 0)
)
DEFAULT_REASONING_EFFORT_LOW_THINKING_BUDGET = int(
    os.getenv("DEFAULT_REASONING_EFFORT_LOW_THINKING_BUDGET", 1024)
)
DEFAULT_REASONING_EFFORT_MEDIUM_THINKING_BUDGET = int(
    os.getenv("DEFAULT_REASONING_EFFORT_MEDIUM_THINKING_BUDGET", 2048)
)
DEFAULT_REASONING_EFFORT_HIGH_THINKING_BUDGET = int(
    os.getenv("DEFAULT_REASONING_EFFORT_HIGH_THINKING_BUDGET", 4096)
)
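# Illustrative mapping (an assumption, not litellm's actual helper) from a
# caller-supplied, OpenAI-style `reasoning_effort` string to a thinking-token
# budget using the four defaults above:
#
#     _EFFORT_TO_THINKING_BUDGET = {
#         "disable": DEFAULT_REASONING_EFFORT_DISABLE_THINKING_BUDGET,
#         "low": DEFAULT_REASONING_EFFORT_LOW_THINKING_BUDGET,
#         "medium": DEFAULT_REASONING_EFFORT_MEDIUM_THINKING_BUDGET,
#         "high": DEFAULT_REASONING_EFFORT_HIGH_THINKING_BUDGET,
#     }
#     budget = _EFFORT_TO_THINKING_BUDGET.get(effort, DEFAULT_REASONING_EFFORT_MEDIUM_THINKING_BUDGET)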
MAX_TOKEN_TRIMMING_ATTEMPTS = int(
    os.getenv("MAX_TOKEN_TRIMMING_ATTEMPTS", 10)
)  # maximum number of attempts to trim the message
########## Networking constants ##############################################################
_DEFAULT_TTL_FOR_HTTPX_CLIENTS = 3600 # 1 hour, re-use the same httpx client for 1 hour
########### v2 Architecture constants for managing writing updates to the database ###########
REDIS_UPDATE_BUFFER_KEY = "litellm_spend_update_buffer"
REDIS_DAILY_SPEND_UPDATE_BUFFER_KEY = "litellm_daily_spend_update_buffer"
REDIS_DAILY_TEAM_SPEND_UPDATE_BUFFER_KEY = "litellm_daily_team_spend_update_buffer"
REDIS_DAILY_TAG_SPEND_UPDATE_BUFFER_KEY = "litellm_daily_tag_spend_update_buffer"
MAX_REDIS_BUFFER_DEQUEUE_COUNT = int(os.getenv("MAX_REDIS_BUFFER_DEQUEUE_COUNT", 100))
MAX_SIZE_IN_MEMORY_QUEUE = int(os.getenv("MAX_SIZE_IN_MEMORY_QUEUE", 10000))
MAX_IN_MEMORY_QUEUE_FLUSH_COUNT = int(
    os.getenv("MAX_IN_MEMORY_QUEUE_FLUSH_COUNT", 1000)
)
###############################################################################################
MINIMUM_PROMPT_CACHE_TOKEN_COUNT = int(
    os.getenv("MINIMUM_PROMPT_CACHE_TOKEN_COUNT", 1024)
)  # minimum number of tokens Anthropic requires before a prompt can be cached
DEFAULT_TRIM_RATIO = float(
    os.getenv("DEFAULT_TRIM_RATIO", 0.75)
)  # default fraction of the model's max tokens to target when trimming a prompt
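# Sketch of the trimming loop these two constants parameterize (hypothetical
# `count_tokens` / `drop_oldest` callables; the real logic lives in litellm's
# token utilities):
#
#     def _trim(messages: list, model_max_tokens: int, count_tokens, drop_oldest) -> list:
#         target = int(model_max_tokens * DEFAULT_TRIM_RATIO)
#         for _ in range(MAX_TOKEN_TRIMMING_ATTEMPTS):
#             if count_tokens(messages) <= target:
#                 break
#             messages = drop_oldest(messages)
#         return messages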
HOURS_IN_A_DAY = int(os.getenv("HOURS_IN_A_DAY", 24))
DAYS_IN_A_WEEK = int(os.getenv("DAYS_IN_A_WEEK", 7))
DAYS_IN_A_MONTH = int(os.getenv("DAYS_IN_A_MONTH", 28))
DAYS_IN_A_YEAR = int(os.getenv("DAYS_IN_A_YEAR", 365))
REPLICATE_MODEL_NAME_WITH_ID_LENGTH = int(
    os.getenv("REPLICATE_MODEL_NAME_WITH_ID_LENGTH", 64)
)
#### TOKEN COUNTING ####
FUNCTION_DEFINITION_TOKEN_COUNT = int(os.getenv("FUNCTION_DEFINITION_TOKEN_COUNT", 9))
SYSTEM_MESSAGE_TOKEN_COUNT = int(os.getenv("SYSTEM_MESSAGE_TOKEN_COUNT", 4))
TOOL_CHOICE_OBJECT_TOKEN_COUNT = int(os.getenv("TOOL_CHOICE_OBJECT_TOKEN_COUNT", 4))
DEFAULT_MOCK_RESPONSE_PROMPT_TOKEN_COUNT = int(
    os.getenv("DEFAULT_MOCK_RESPONSE_PROMPT_TOKEN_COUNT", 10)
)
DEFAULT_MOCK_RESPONSE_COMPLETION_TOKEN_COUNT = int(
    os.getenv("DEFAULT_MOCK_RESPONSE_COMPLETION_TOKEN_COUNT", 20)
)
MAX_SHORT_SIDE_FOR_IMAGE_HIGH_RES = int(
    os.getenv("MAX_SHORT_SIDE_FOR_IMAGE_HIGH_RES", 768)
)
MAX_LONG_SIDE_FOR_IMAGE_HIGH_RES = int(
    os.getenv("MAX_LONG_SIDE_FOR_IMAGE_HIGH_RES", 2000)
)
MAX_TILE_WIDTH = int(os.getenv("MAX_TILE_WIDTH", 512))
MAX_TILE_HEIGHT = int(os.getenv("MAX_TILE_HEIGHT", 512))
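# Sketch of the tile-based image token accounting these limits suggest,
# modeled on OpenAI-style "high detail" counting (an assumption, not a
# verbatim copy of litellm's implementation):
#
#     import math
#
#     def _tiles_for_image(width: int, height: int) -> int:
#         short, long = sorted((width, height))
#         scale = min(
#             1.0,
#             MAX_SHORT_SIDE_FOR_IMAGE_HIGH_RES / short,
#             MAX_LONG_SIDE_FOR_IMAGE_HIGH_RES / long,
#         )
#         w, h = int(width * scale), int(height * scale)
#         return math.ceil(w / MAX_TILE_WIDTH) * math.ceil(h / MAX_TILE_HEIGHT)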
OPENAI_FILE_SEARCH_COST_PER_1K_CALLS = float(
    os.getenv("OPENAI_FILE_SEARCH_COST_PER_1K_CALLS", 2.5 / 1000)
)
# Azure OpenAI Assistants feature costs
# Source: https://azure.microsoft.com/en-us/pricing/details/cognitive-services/openai-service/
AZURE_FILE_SEARCH_COST_PER_GB_PER_DAY = float(
    os.getenv("AZURE_FILE_SEARCH_COST_PER_GB_PER_DAY", 0.1)  # $0.1 USD per 1 GB/Day
)
AZURE_CODE_INTERPRETER_COST_PER_SESSION = float(
    os.getenv(
        "AZURE_CODE_INTERPRETER_COST_PER_SESSION", 0.03
    )  # $0.03 USD per 1 Session
)
AZURE_COMPUTER_USE_INPUT_COST_PER_1K_TOKENS = float(
    os.getenv(
        "AZURE_COMPUTER_USE_INPUT_COST_PER_1K_TOKENS", 3.0
    )  # $0.003 USD per 1K Tokens
)
AZURE_COMPUTER_USE_OUTPUT_COST_PER_1K_TOKENS = float(
    os.getenv(
        "AZURE_COMPUTER_USE_OUTPUT_COST_PER_1K_TOKENS", 12.0
    )  # $0.012 USD per 1K Tokens
)
AZURE_VECTOR_STORE_COST_PER_GB_PER_DAY = float(
    os.getenv(
        "AZURE_VECTOR_STORE_COST_PER_GB_PER_DAY", 0.1
    )  # $0.1 USD per 1 GB/Day (same as file search)
)
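# Sketch of applying a per-GB/day rate like the ones above (hypothetical
# helper; unit handling for the token-based constants should be verified
# against litellm's cost-tracking code before reuse):
#
#     def _file_search_storage_cost(gb_stored: float, days: float) -> float:
#         return gb_stored * days * AZURE_FILE_SEARCH_COST_PER_GB_PER_DAY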
MIN_NON_ZERO_TEMPERATURE = float(os.getenv("MIN_NON_ZERO_TEMPERATURE", 0.0001))
#### RELIABILITY ####
REPEATED_STREAMING_CHUNK_LIMIT = int(
    os.getenv("REPEATED_STREAMING_CHUNK_LIMIT", 100)
)  # catch if the model starts looping the same chunk while streaming; the high default prevents false positives
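# Sketch of the loop-detection idea behind this limit (illustrative wrapper,
# not litellm's actual streaming handler):
#
#     def _guard_against_chunk_loops(stream):
#         repeats, last_chunk = 0, None
#         for chunk in stream:
#             repeats = repeats + 1 if chunk == last_chunk else 0
#             last_chunk = chunk
#             if repeats >= REPEATED_STREAMING_CHUNK_LIMIT:
#                 raise RuntimeError("model appears to be looping the same streaming chunk")
#             yield chunk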
DEFAULT_MAX_LRU_CACHE_SIZE = int(os.getenv("DEFAULT_MAX_LRU_CACHE_SIZE", 16))
INITIAL_RETRY_DELAY = float(os.getenv("INITIAL_RETRY_DELAY", 0.5))
MAX_RETRY_DELAY = float(os.getenv("MAX_RETRY_DELAY", 8.0))
JITTER = float(os.getenv("JITTER", 0.75))
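# INITIAL_RETRY_DELAY, MAX_RETRY_DELAY, and JITTER describe a capped
# exponential backoff. One common way to combine them (a sketch, not the
# exact retry policy used elsewhere in litellm):
#
#     import random
#
#     def _retry_delay(attempt: int) -> float:
#         delay = min(INITIAL_RETRY_DELAY * (2 ** attempt), MAX_RETRY_DELAY)
#         return delay * (1 + random.uniform(-JITTER, JITTER))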
DEFAULT_IN_MEMORY_TTL = int(
    os.getenv("DEFAULT_IN_MEMORY_TTL", 5)
)  # default time-to-live for the in-memory cache, in seconds
DEFAULT_POLLING_INTERVAL = float(
    os.getenv("DEFAULT_POLLING_INTERVAL", 0.03)
)  # default polling interval for the scheduler, in seconds
AZURE_OPERATION_POLLING_TIMEOUT = int(os.getenv("AZURE_OPERATION_POLLING_TIMEOUT", 120))
REDIS_SOCKET_TIMEOUT = float(os.getenv("REDIS_SOCKET_TIMEOUT", 0.1))
REDIS_CONNECTION_POOL_TIMEOUT = int(os.getenv("REDIS_CONNECTION_POOL_TIMEOUT", 5))
NON_LLM_CONNECTION_TIMEOUT = int(
    os.getenv("NON_LLM_CONNECTION_TIMEOUT", 15)
)  # timeout for adjacent services (e.g. JWT auth)
MAX_EXCEPTION_MESSAGE_LENGTH = int(os.getenv("MAX_EXCEPTION_MESSAGE_LENGTH", 2000))
BEDROCK_MAX_POLICY_SIZE = int(os.getenv("BEDROCK_MAX_POLICY_SIZE", 75))
REPLICATE_POLLING_DELAY_SECONDS = float(
    os.getenv("REPLICATE_POLLING_DELAY_SECONDS", 0.5)
)
DEFAULT_ANTHROPIC_CHAT_MAX_TOKENS = int(
    os.getenv("DEFAULT_ANTHROPIC_CHAT_MAX_TOKENS", 4096)
)
TOGETHER_AI_4_B = int(os.getenv("TOGETHER_AI_4_B", 4))
TOGETHER_AI_8_B = int(os.getenv("TOGETHER_AI_8_B", 8))
TOGETHER_AI_21_B = int(os.getenv("TOGETHER_AI_21_B", 21))
TOGETHER_AI_41_B = int(os.getenv("TOGETHER_AI_41_B", 41))
TOGETHER_AI_80_B = int(os.getenv("TOGETHER_AI_80_B", 80))
TOGETHER_AI_110_B = int(os.getenv("TOGETHER_AI_110_B", 110))
TOGETHER_AI_EMBEDDING_150_M = int(os.getenv("TOGETHER_AI_EMBEDDING_150_M", 150))
TOGETHER_AI_EMBEDDING_350_M = int(os.getenv("TOGETHER_AI_EMBEDDING_350_M", 350))
QDRANT_SCALAR_QUANTILE = float(os.getenv("QDRANT_SCALAR_QUANTILE", 0.99))
QDRANT_VECTOR_SIZE = int(os.getenv("QDRANT_VECTOR_SIZE", 1536))
CACHED_STREAMING_CHUNK_DELAY = float(os.getenv("CACHED_STREAMING_CHUNK_DELAY", 0.02))
MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB = int(
    os.getenv("MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB", 512)
)  # NOTE: redefines the constant declared earlier with a 1024 KB default; this later 512 KB value wins
DEFAULT_MAX_TOKENS_FOR_TRITON = int(os.getenv("DEFAULT_MAX_TOKENS_FOR_TRITON", 2000))
#### Networking settings ####
request_timeout: float = float(os.getenv("REQUEST_TIMEOUT", 6000)) # time in seconds
STREAM_SSE_DONE_STRING: str = "[DONE]"
STREAM_SSE_DATA_PREFIX: str = "data: "
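# Minimal sketch of consuming an OpenAI-style SSE stream with these two
# markers (assumes an iterator of decoded text lines; not litellm's parser):
#
#     import json
#
#     def _iter_sse_payloads(lines):
#         for line in lines:
#             if not line.startswith(STREAM_SSE_DATA_PREFIX):
#                 continue
#             data = line[len(STREAM_SSE_DATA_PREFIX):].strip()
#             if data == STREAM_SSE_DONE_STRING:
#                 return
#             yield json.loads(data)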
### SPEND TRACKING ###
DEFAULT_REPLICATE_GPU_PRICE_PER_SECOND = float(
    os.getenv("DEFAULT_REPLICATE_GPU_PRICE_PER_SECOND", 0.001400)
)  # price per second for an A100 80GB
FIREWORKS_AI_56_B_MOE = int(os.getenv("FIREWORKS_AI_56_B_MOE", 56))
FIREWORKS_AI_176_B_MOE = int(os.getenv("FIREWORKS_AI_176_B_MOE", 176))
FIREWORKS_AI_4_B = int(os.getenv("FIREWORKS_AI_4_B", 4))
FIREWORKS_AI_16_B = int(os.getenv("FIREWORKS_AI_16_B", 16))
FIREWORKS_AI_80_B = int(os.getenv("FIREWORKS_AI_80_B", 80))
#### Logging callback constants ####
REDACTED_BY_LITELM_STRING = "REDACTED_BY_LITELM"
MAX_LANGFUSE_INITIALIZED_CLIENTS = int(
    os.getenv("MAX_LANGFUSE_INITIALIZED_CLIENTS", 50)
)
DD_TRACER_STREAMING_CHUNK_YIELD_RESOURCE = os.getenv(
    "DD_TRACER_STREAMING_CHUNK_YIELD_RESOURCE", "streaming.chunk.yield"
)
############### LLM Provider Constants ###############
### ANTHROPIC CONSTANTS ###
ANTHROPIC_WEB_SEARCH_TOOL_MAX_USES = {
    "low": 1,
    "medium": 5,
    "high": 10,
}
DEFAULT_IMAGE_ENDPOINT_MODEL = "dall-e-2"
LITELLM_CHAT_PROVIDERS = [
    "openai",
    "openai_like",
    "bytez",
    "xai",
    "custom_openai",
    "text-completion-openai",
    "cohere",
    "cohere_chat",
    "clarifai",
    "anthropic",
    "anthropic_text",
    "replicate",
    "huggingface",
    "together_ai",
    "datarobot",
    "openrouter",
    "vertex_ai",
    "vertex_ai_beta",
    "gemini",
    "ai21",
    "baseten",
    "azure",
    "azure_text",
    "azure_ai",
    "sagemaker",
    "sagemaker_chat",
    "bedrock",
    "vllm",
    "nlp_cloud",
    "petals",
    "oobabooga",
    "ollama",
    "ollama_chat",
    "deepinfra",
    "perplexity",
    "mistral",
    "groq",
    "nvidia_nim",
    "cerebras",
    "ai21_chat",
    "volcengine",
    "codestral",
    "text-completion-codestral",
    "deepseek",
    "sambanova",
    "maritalk",
    "cloudflare",
    "fireworks_ai",
    "friendliai",
    "watsonx",
    "watsonx_text",
    "triton",
    "predibase",
    "databricks",
    "empower",
    "github",
    "custom",
    "litellm_proxy",
    "hosted_vllm",
    "llamafile",
    "lm_studio",
    "galadriel",
    "github_copilot",  # GitHub Copilot Chat API
    "novita",
    "meta_llama",
    "featherless_ai",
    "nscale",
    "nebius",
    "dashscope",
    "moonshot",
    "v0",
    "oci",
    "morph",
    "lambda_ai",
]
LITELLM_EMBEDDING_PROVIDERS_SUPPORTING_INPUT_ARRAY_OF_TOKENS = [
    "openai",
    "azure",
    "hosted_vllm",
    "nebius",
]
OPENAI_CHAT_COMPLETION_PARAMS = [
    "functions",
    "function_call",
    "temperature",
    "top_p",
    "n",
    "stream",
    "stream_options",
    "stop",
    "max_completion_tokens",
    "modalities",
    "prediction",
    "audio",
    "max_tokens",
    "presence_penalty",
    "frequency_penalty",
    "logit_bias",
    "user",
    "request_timeout",
    "api_base",
    "api_version",
    "api_key",
    "deployment_id",
    "organization",
    "base_url",
    "default_headers",
    "timeout",
    "response_format",
    "seed",
    "tools",
    "tool_choice",
    "max_retries",
    "parallel_tool_calls",
    "logprobs",
    "top_logprobs",
    "reasoning_effort",
    "extra_headers",
    "thinking",
    "web_search_options",
]
OPENAI_TRANSCRIPTION_PARAMS = [
    "language",
    "response_format",
    "timestamp_granularities",
]
OPENAI_EMBEDDING_PARAMS = ["dimensions", "encoding_format", "user"]
DEFAULT_EMBEDDING_PARAM_VALUES = {
    **{k: None for k in OPENAI_EMBEDDING_PARAMS},
    "model": None,
    "custom_llm_provider": "",
    "input": None,
}
DEFAULT_CHAT_COMPLETION_PARAM_VALUES = {
    "functions": None,
    "function_call": None,
    "temperature": None,
    "top_p": None,
    "n": None,
    "stream": None,
    "stream_options": None,
    "stop": None,
    "max_tokens": None,
    "max_completion_tokens": None,
    "modalities": None,
    "prediction": None,
    "audio": None,
    "presence_penalty": None,
    "frequency_penalty": None,
    "logit_bias": None,
    "user": None,
    "model": None,
    "custom_llm_provider": "",
    "response_format": None,
    "seed": None,
    "tools": None,
    "tool_choice": None,
    "max_retries": None,
    "logprobs": None,
    "top_logprobs": None,
    "extra_headers": None,
    "api_version": None,
    "parallel_tool_calls": None,
    "drop_params": None,
    "allowed_openai_params": None,
    "additional_drop_params": None,
    "messages": None,
    "reasoning_effort": None,
    "thinking": None,
    "web_search_options": None,
}
openai_compatible_endpoints: List = [
    "api.perplexity.ai",
    "api.endpoints.anyscale.com/v1",
    "api.deepinfra.com/v1/openai",
    "api.mistral.ai/v1",
    "codestral.mistral.ai/v1/chat/completions",
    "codestral.mistral.ai/v1/fim/completions",
    "api.groq.com/openai/v1",
    "https://integrate.api.nvidia.com/v1",
    "api.deepseek.com/v1",
    "api.together.xyz/v1",
    "app.empower.dev/api/v1",
    "https://api.friendli.ai/serverless/v1",
    "api.sambanova.ai/v1",
    "api.x.ai/v1",
    "api.galadriel.ai/v1",
    "api.llama.com/compat/v1/",
    "api.featherless.ai/v1",
    "inference.api.nscale.com/v1",
    "api.studio.nebius.ai/v1",
    "https://dashscope-intl.aliyuncs.com/compatible-mode/v1",
    "https://api.moonshot.ai/v1",
    "https://api.v0.dev/v1",
    "https://api.morphllm.com/v1",
    "https://api.lambda.ai/v1",
    "https://api.hyperbolic.xyz/v1",
]
openai_compatible_providers: List = [
    "anyscale",
    "groq",
    "nvidia_nim",
    "cerebras",
    "sambanova",
    "ai21_chat",
    "ai21",
    "volcengine",
    "codestral",
    "deepseek",
    "deepinfra",
    "perplexity",
    "xinference",
    "xai",
    "together_ai",
    "fireworks_ai",
    "empower",
    "friendliai",
    "azure_ai",
    "github",
    "litellm_proxy",
    "hosted_vllm",
    "llamafile",
    "lm_studio",
    "galadriel",
    "github_copilot",  # GitHub Copilot Chat API
    "novita",
    "meta_llama",
    "featherless_ai",
    "nscale",
    "nebius",
    "dashscope",
    "moonshot",
    "v0",
    "morph",
    "lambda_ai",
    "hyperbolic",
]
openai_text_completion_compatible_providers: List = (
    [  # providers that support `/v1/completions`
        "together_ai",
        "fireworks_ai",
        "hosted_vllm",
        "meta_llama",
        "llamafile",
        "featherless_ai",
        "nebius",
        "dashscope",
        "moonshot",
        "v0",
        "lambda_ai",
        "hyperbolic",
    ]
)
_openai_like_providers: List = [
    "predibase",
    "databricks",
    "watsonx",
]  # private helper: similar to OpenAI, but these require custom auth / endpoint handling, so the OpenAI SDK can't be used
# well-supported Replicate LLMs
replicate_models: List = [
    # LLaMA models supported on Replicate
    "replicate/llama-2-70b-chat:2796ee9483c3fd7aa2e171d38f4ca12251a30609463dcfd4cd76703f22e96cdf",
    "a16z-infra/llama-2-13b-chat:2a7f981751ec7fdf87b5b91ad4db53683a98082e9ff7bfd12c8cd5ea85980a52",
    "meta/codellama-13b:1c914d844307b0588599b8393480a3ba917b660c7e9dfae681542b5325f228db",
    # Vicuna
    "replicate/vicuna-13b:6282abe6a492de4145d7bb601023762212f9ddbbe78278bd6771c8b3b2f2a13b",
    "joehoover/instructblip-vicuna13b:c4c54e3c8c97cd50c2d2fec9be3b6065563ccf7d43787fb99f84151b867178fe",
    # Flan T-5
    "daanelson/flan-t5-large:ce962b3f6792a57074a601d3979db5839697add2e4e02696b3ced4c022d4767f",
    # Others
    "replicate/dolly-v2-12b:ef0e1aefc61f8e096ebe4db6b2bacc297daf2ef6899f0f7e001ec445893500e5",
    "replit/replit-code-v1-3b:b84f4c074b807211cd75e3e8b1589b6399052125b4c27106e43d47189e8415ad",
]
clarifai_models: List = [
    "clarifai/meta.Llama-3.Llama-3-8B-Instruct",
    "clarifai/gcp.generate.gemma-1_1-7b-it",
    "clarifai/mistralai.completion.mixtral-8x22B",
    "clarifai/cohere.generate.command-r-plus",
    "clarifai/databricks.drbx.dbrx-instruct",
    "clarifai/mistralai.completion.mistral-large",
    "clarifai/mistralai.completion.mistral-medium",
    "clarifai/mistralai.completion.mistral-small",
    "clarifai/mistralai.completion.mixtral-8x7B-Instruct-v0_1",
    "clarifai/gcp.generate.gemma-2b-it",
    "clarifai/gcp.generate.gemma-7b-it",
    "clarifai/deci.decilm.deciLM-7B-instruct",
    "clarifai/mistralai.completion.mistral-7B-Instruct",
    "clarifai/gcp.generate.gemini-pro",
    "clarifai/anthropic.completion.claude-v1",
    "clarifai/anthropic.completion.claude-instant-1_2",
    "clarifai/anthropic.completion.claude-instant",
    "clarifai/anthropic.completion.claude-v2",
    "clarifai/anthropic.completion.claude-2_1",
    "clarifai/meta.Llama-2.codeLlama-70b-Python",
    "clarifai/meta.Llama-2.codeLlama-70b-Instruct",
    "clarifai/openai.completion.gpt-3_5-turbo-instruct",
    "clarifai/meta.Llama-2.llama2-7b-chat",
    "clarifai/meta.Llama-2.llama2-13b-chat",
    "clarifai/meta.Llama-2.llama2-70b-chat",
    "clarifai/openai.chat-completion.gpt-4-turbo",
    "clarifai/microsoft.text-generation.phi-2",
    "clarifai/meta.Llama-2.llama2-7b-chat-vllm",
    "clarifai/upstage.solar.solar-10_7b-instruct",
    "clarifai/openchat.openchat.openchat-3_5-1210",
    "clarifai/togethercomputer.stripedHyena.stripedHyena-Nous-7B",
    "clarifai/gcp.generate.text-bison",
    "clarifai/meta.Llama-2.llamaGuard-7b",
    "clarifai/fblgit.una-cybertron.una-cybertron-7b-v2",
    "clarifai/openai.chat-completion.GPT-4",
    "clarifai/openai.chat-completion.GPT-3_5-turbo",
    "clarifai/ai21.complete.Jurassic2-Grande",
    "clarifai/ai21.complete.Jurassic2-Grande-Instruct",
    "clarifai/ai21.complete.Jurassic2-Jumbo-Instruct",
    "clarifai/ai21.complete.Jurassic2-Jumbo",
    "clarifai/ai21.complete.Jurassic2-Large",
    "clarifai/cohere.generate.cohere-generate-command",
    "clarifai/wizardlm.generate.wizardCoder-Python-34B",
    "clarifai/wizardlm.generate.wizardLM-70B",
    "clarifai/tiiuae.falcon.falcon-40b-instruct",
    "clarifai/togethercomputer.RedPajama.RedPajama-INCITE-7B-Chat",
    "clarifai/gcp.generate.code-gecko",
    "clarifai/gcp.generate.code-bison",
    "clarifai/mistralai.completion.mistral-7B-OpenOrca",
    "clarifai/mistralai.completion.openHermes-2-mistral-7B",
    "clarifai/wizardlm.generate.wizardLM-13B",
    "clarifai/huggingface-research.zephyr.zephyr-7B-alpha",
    "clarifai/wizardlm.generate.wizardCoder-15B",
    "clarifai/microsoft.text-generation.phi-1_5",
    "clarifai/databricks.Dolly-v2.dolly-v2-12b",
    "clarifai/bigcode.code.StarCoder",
    "clarifai/salesforce.xgen.xgen-7b-8k-instruct",
    "clarifai/mosaicml.mpt.mpt-7b-instruct",
    "clarifai/anthropic.completion.claude-3-opus",
    "clarifai/anthropic.completion.claude-3-sonnet",
    "clarifai/gcp.generate.gemini-1_5-pro",
    "clarifai/gcp.generate.imagen-2",
    "clarifai/salesforce.blip.general-english-image-caption-blip-2",
]
huggingface_models: List = [
    "meta-llama/Llama-2-7b-hf",
    "meta-llama/Llama-2-7b-chat-hf",
    "meta-llama/Llama-2-13b-hf",
    "meta-llama/Llama-2-13b-chat-hf",
    "meta-llama/Llama-2-70b-hf",
    "meta-llama/Llama-2-70b-chat-hf",
    "meta-llama/Llama-2-7b",
    "meta-llama/Llama-2-7b-chat",
    "meta-llama/Llama-2-13b",
    "meta-llama/Llama-2-13b-chat",
    "meta-llama/Llama-2-70b",
    "meta-llama/Llama-2-70b-chat",
]  # these have been tested extensively, but by default all text2text-generation and text-generation models are supported by LiteLLM - https://docs.litellm.ai/docs/providers
empower_models = [
    "empower/empower-functions",
    "empower/empower-functions-small",
]
together_ai_models: List = [
    # llama llms - chat
    "togethercomputer/llama-2-70b-chat",
    # llama llms - language / instruct
    "togethercomputer/llama-2-70b",
    "togethercomputer/LLaMA-2-7B-32K",
    "togethercomputer/Llama-2-7B-32K-Instruct",
    "togethercomputer/llama-2-7b",
    # falcon llms
    "togethercomputer/falcon-40b-instruct",
    "togethercomputer/falcon-7b-instruct",
    # alpaca
    "togethercomputer/alpaca-7b",
    # chat llms
    "HuggingFaceH4/starchat-alpha",
    # code llms
    "togethercomputer/CodeLlama-34b",
    "togethercomputer/CodeLlama-34b-Instruct",
    "togethercomputer/CodeLlama-34b-Python",
    "defog/sqlcoder",
    "NumbersStation/nsql-llama-2-7B",
    "WizardLM/WizardCoder-15B-V1.0",
    "WizardLM/WizardCoder-Python-34B-V1.0",
    # language llms
    "NousResearch/Nous-Hermes-Llama2-13b",
    "Austism/chronos-hermes-13b",
    "upstage/SOLAR-0-70b-16bit",
    "WizardLM/WizardLM-70B-V1.0",
]  # supports all Together AI models; just pass in the model id, e.g. completion(model="togethercomputer/replit_code_3b",...)
baseten_models: List = [
    "qvv0xeq",  # FALCON 7B
    "q841o8w",  # WizardLM
    "31dxrj3",  # Mosaic ML
]
featherless_ai_models: List = [
    "featherless-ai/Qwerky-72B",
    "featherless-ai/Qwerky-QwQ-32B",
    "Qwen/Qwen2.5-72B-Instruct",
    "all-hands/openhands-lm-32b-v0.1",
    "Qwen/Qwen2.5-Coder-32B-Instruct",
    "deepseek-ai/DeepSeek-V3-0324",
    "mistralai/Mistral-Small-24B-Instruct-2501",
    "mistralai/Mistral-Nemo-Instruct-2407",
    "ProdeusUnity/Stellar-Odyssey-12b-v0.0",
]
nebius_models: List = [
    "Qwen/Qwen3-235B-A22B",
    "Qwen/Qwen3-30B-A3B-fast",
    "Qwen/Qwen3-32B",
    "Qwen/Qwen3-14B",
    "nvidia/Llama-3_1-Nemotron-Ultra-253B-v1",
    "deepseek-ai/DeepSeek-V3-0324",
    "deepseek-ai/DeepSeek-V3-0324-fast",
    "deepseek-ai/DeepSeek-R1",
    "deepseek-ai/DeepSeek-R1-fast",
    "meta-llama/Llama-3.3-70B-Instruct-fast",
    "Qwen/Qwen2.5-32B-Instruct-fast",
    "Qwen/Qwen2.5-Coder-32B-Instruct-fast",
]
dashscope_models: List = [
    "qwen-turbo",
    "qwen-plus",
    "qwen-max",
    "qwen-turbo-latest",
    "qwen-plus-latest",
    "qwen-max-latest",
    "qwq-32b",
    "qwen3-235b-a22b",
    "qwen3-32b",
    "qwen3-30b-a3b",
]
nebius_embedding_models: List = [
    "BAAI/bge-en-icl",
    "BAAI/bge-multilingual-gemma2",
    "intfloat/e5-mistral-7b-instruct",
]
BEDROCK_INVOKE_PROVIDERS_LITERAL = Literal[
    "cohere",
    "anthropic",
    "mistral",
    "amazon",
    "meta",
    "llama",
    "ai21",
    "nova",
    "deepseek_r1",
]
open_ai_embedding_models: List = ["text-embedding-ada-002"]
cohere_embedding_models: List = [
    "embed-v4.0",
    "embed-english-v3.0",
    "embed-english-light-v3.0",
    "embed-multilingual-v3.0",
    "embed-english-v2.0",
    "embed-english-light-v2.0",
    "embed-multilingual-v2.0",
]
bedrock_embedding_models: List = [
    "amazon.titan-embed-text-v1",
    "cohere.embed-english-v3",
    "cohere.embed-multilingual-v3",
]
known_tokenizer_config = {
    "mistralai/Mistral-7B-Instruct-v0.1": {
        "tokenizer": {
            "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token + ' ' }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}",
            "bos_token": "<s>",
            "eos_token": "</s>",
        },
        "status": "success",
    },
    "meta-llama/Meta-Llama-3-8B-Instruct": {
        "tokenizer": {
            "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}",
            "bos_token": "<|begin_of_text|>",
            "eos_token": "",
        },
        "status": "success",
    },
    "deepseek-r1/deepseek-r1-7b-instruct": {
        "tokenizer": {
            "add_bos_token": True,
            "add_eos_token": False,
            "bos_token": {
                "__type": "AddedToken",
                "content": "<｜begin▁of▁sentence｜>",
                "lstrip": False,
                "normalized": True,
                "rstrip": False,
                "single_word": False,
            },
            "clean_up_tokenization_spaces": False,
            "eos_token": {
                "__type": "AddedToken",
                "content": "<｜end▁of▁sentence｜>",
                "lstrip": False,
                "normalized": True,
                "rstrip": False,
                "single_word": False,
            },
            "legacy": True,
            "model_max_length": 16384,
            "pad_token": {
                "__type": "AddedToken",
                "content": "<｜end▁of▁sentence｜>",
                "lstrip": False,
                "normalized": True,
                "rstrip": False,
                "single_word": False,
            },
            "sp_model_kwargs": {},
            "unk_token": None,
            "tokenizer_class": "LlamaTokenizerFast",
            "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<｜User｜>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<｜Assistant｜><｜tool▁calls▁begin｜><｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<｜tool▁call▁end｜>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<｜tool▁call▁end｜>'}}{{'<｜tool▁calls▁end｜><｜end▁of▁sentence｜>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<｜tool▁outputs▁end｜>' + message['content'] + '<｜end▁of▁sentence｜>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}{{'<｜Assistant｜>' + content + '<｜end▁of▁sentence｜>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<｜tool▁outputs▁begin｜><｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<｜tool▁outputs▁end｜>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<｜Assistant｜><think>\\n'}}{% endif %}",
        },
        "status": "success",
    },
}
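# The `chat_template` values above are Jinja2 templates. A sketch of rendering
# one into a prompt string (requires the third-party `jinja2` package;
# illustrative only):
#
#     from jinja2 import Template
#
#     _cfg = known_tokenizer_config["mistralai/Mistral-7B-Instruct-v0.1"]["tokenizer"]
#     _prompt = Template(_cfg["chat_template"]).render(
#         messages=[{"role": "user", "content": "Hi"}],
#         bos_token=_cfg["bos_token"],
#         eos_token=_cfg["eos_token"],
#     )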
OPENAI_FINISH_REASONS = ["stop", "length", "function_call", "content_filter", "null"]
HUMANLOOP_PROMPT_CACHE_TTL_SECONDS = int(
    os.getenv("HUMANLOOP_PROMPT_CACHE_TTL_SECONDS", 60)
)  # 1 minute
RESPONSE_FORMAT_TOOL_NAME = "json_tool_call" # default tool name used when converting response format to tool call
########################### Logging Callback Constants ###########################
AZURE_STORAGE_MSFT_VERSION = "2019-07-07"
PROMETHEUS_BUDGET_METRICS_REFRESH_INTERVAL_MINUTES = int(
    os.getenv("PROMETHEUS_BUDGET_METRICS_REFRESH_INTERVAL_MINUTES", 5)
)
MCP_TOOL_NAME_PREFIX = "mcp_tool"
MAXIMUM_TRACEBACK_LINES_TO_LOG = int(os.getenv("MAXIMUM_TRACEBACK_LINES_TO_LOG", 100))
# Headers to control callbacks
X_LITELLM_DISABLE_CALLBACKS = "x-litellm-disable-callbacks"
LITELLM_METADATA_FIELD = "litellm_metadata"
OLD_LITELLM_METADATA_FIELD = "metadata"
########################### LiteLLM Proxy Specific Constants ###########################
########################################################################################
MAX_SPENDLOG_ROWS_TO_QUERY = int(
    os.getenv("MAX_SPENDLOG_ROWS_TO_QUERY", 1_000_000)
)  # if SpendLogs has more than 1M rows, do not query the DB
DEFAULT_SOFT_BUDGET = float(
    os.getenv("DEFAULT_SOFT_BUDGET", 50.0)
)  # by default, all LiteLLM proxy keys have a soft budget of 50.0
# makes it clear this is a rate limit error for a litellm virtual key
RATE_LIMIT_ERROR_MESSAGE_FOR_VIRTUAL_KEY = "LiteLLM Virtual Key user_api_key_hash"
# pass-through route constants
BEDROCK_AGENT_RUNTIME_PASS_THROUGH_ROUTES = [
    "agents/",
    "knowledgebases/",
    "flows/",
    "retrieveAndGenerate/",
    "rerank/",
    "generateQuery/",
    "optimize-prompt/",
]
BASE_MCP_ROUTE = "/mcp"
BATCH_STATUS_POLL_INTERVAL_SECONDS = int(
    os.getenv("BATCH_STATUS_POLL_INTERVAL_SECONDS", 3600)
)  # 1 hour
BATCH_STATUS_POLL_MAX_ATTEMPTS = int(
    os.getenv("BATCH_STATUS_POLL_MAX_ATTEMPTS", 24)
)  # for 24 hours
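# Together, the two constants above give roughly 24 hours of polling
# (24 attempts, one per hour). Sketch of the loop shape (hypothetical
# `get_status` callable; blocking sleep shown for simplicity):
#
#     import time
#
#     def _wait_for_batch(get_status) -> str:
#         for _ in range(BATCH_STATUS_POLL_MAX_ATTEMPTS):
#             status = get_status()
#             if status in ("completed", "failed", "expired"):
#                 return status
#             time.sleep(BATCH_STATUS_POLL_INTERVAL_SECONDS)
#         return "timed_out"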
HEALTH_CHECK_TIMEOUT_SECONDS = int(
    os.getenv("HEALTH_CHECK_TIMEOUT_SECONDS", 60)
)  # 60 seconds
LITTELM_INTERNAL_HEALTH_SERVICE_ACCOUNT_NAME = "litellm-internal-health-check"
UI_SESSION_TOKEN_TEAM_ID = "litellm-dashboard"
LITELLM_PROXY_ADMIN_NAME = "default_user_id"
########################### CLI SSO AUTHENTICATION CONSTANTS ###########################
LITELLM_CLI_SOURCE_IDENTIFIER = "litellm-cli"
LITELLM_CLI_SESSION_TOKEN_PREFIX = "litellm-session-token"
########################### DB CRON JOB NAMES ###########################
DB_SPEND_UPDATE_JOB_NAME = "db_spend_update_job"
PROMETHEUS_EMIT_BUDGET_METRICS_JOB_NAME = "prometheus_emit_budget_metrics"
SPEND_LOG_CLEANUP_JOB_NAME = "spend_log_cleanup"
SPEND_LOG_RUN_LOOPS = int(os.getenv("SPEND_LOG_RUN_LOOPS", 500))
SPEND_LOG_CLEANUP_BATCH_SIZE = int(os.getenv("SPEND_LOG_CLEANUP_BATCH_SIZE", 1000))
DEFAULT_CRON_JOB_LOCK_TTL_SECONDS = int(
    os.getenv("DEFAULT_CRON_JOB_LOCK_TTL_SECONDS", 60)
)  # 1 minute
PROXY_BUDGET_RESCHEDULER_MIN_TIME = int(
    os.getenv("PROXY_BUDGET_RESCHEDULER_MIN_TIME", 597)
)
PROXY_BATCH_POLLING_INTERVAL = int(os.getenv("PROXY_BATCH_POLLING_INTERVAL", 3600))
PROXY_BUDGET_RESCHEDULER_MAX_TIME = int(
    os.getenv("PROXY_BUDGET_RESCHEDULER_MAX_TIME", 605)
)
PROXY_BATCH_WRITE_AT = int(os.getenv("PROXY_BATCH_WRITE_AT", 10)) # in seconds
DEFAULT_HEALTH_CHECK_INTERVAL = int(
    os.getenv("DEFAULT_HEALTH_CHECK_INTERVAL", 300)
)  # 5 minutes
PROMETHEUS_FALLBACK_STATS_SEND_TIME_HOURS = int(
    os.getenv("PROMETHEUS_FALLBACK_STATS_SEND_TIME_HOURS", 9)
)
DEFAULT_MODEL_CREATED_AT_TIME = int(
    os.getenv("DEFAULT_MODEL_CREATED_AT_TIME", 1677610602)
)  # returned on the `/models` endpoint
DEFAULT_SLACK_ALERTING_THRESHOLD = int(
    os.getenv("DEFAULT_SLACK_ALERTING_THRESHOLD", 300)
)
MAX_TEAM_LIST_LIMIT = int(os.getenv("MAX_TEAM_LIST_LIMIT", 20))
DEFAULT_PROMPT_INJECTION_SIMILARITY_THRESHOLD = float(
    os.getenv("DEFAULT_PROMPT_INJECTION_SIMILARITY_THRESHOLD", 0.7)
)
LENGTH_OF_LITELLM_GENERATED_KEY = int(os.getenv("LENGTH_OF_LITELLM_GENERATED_KEY", 16))
SECRET_MANAGER_REFRESH_INTERVAL = int(
    os.getenv("SECRET_MANAGER_REFRESH_INTERVAL", 86400)
)
LITELLM_SETTINGS_SAFE_DB_OVERRIDES = [
    "default_internal_user_params",
    "public_model_groups",
    "public_model_groups_links",
]
SPECIAL_LITELLM_AUTH_TOKEN = ["ui-token"]
DEFAULT_MANAGEMENT_OBJECT_IN_MEMORY_CACHE_TTL = int(
    os.getenv("DEFAULT_MANAGEMENT_OBJECT_IN_MEMORY_CACHE_TTL", 60)
)
# Sentry Scrubbing Configuration
SENTRY_DENYLIST = [
    # API Keys and Tokens
    "api_key",
    "token",
    "key",
    "secret",
    "password",
    "auth",
    "credential",
    "OPENAI_API_KEY",
    "ANTHROPIC_API_KEY",
    "AZURE_API_KEY",
    "COHERE_API_KEY",
    "REPLICATE_API_KEY",
    "HUGGINGFACE_API_KEY",
    "TOGETHERAI_API_KEY",
    "CLOUDFLARE_API_KEY",
    "BASETEN_KEY",
    "OPENROUTER_KEY",
    "DATAROBOT_API_TOKEN",
    "FIREWORKS_API_KEY",
    "FIREWORKS_AI_API_KEY",
    "FIREWORKSAI_API_KEY",
    # Database and Connection Strings
    "database_url",
    "redis_url",
    "connection_string",
    # Authentication and Security
    "master_key",
    "LITELLM_MASTER_KEY",
    "auth_token",
    "jwt_token",
    "private_key",
    "SLACK_WEBHOOK_URL",
    "webhook_url",
    "LANGFUSE_SECRET_KEY",
    # Email Configuration
    "SMTP_PASSWORD",
    "SMTP_USERNAME",
    "email_password",
    # Cloud Provider Credentials
    "aws_access_key",
    "aws_secret_key",
    "gcp_credentials",
    "azure_credentials",
    "HCP_VAULT_TOKEN",
    "CIRCLE_OIDC_TOKEN",
    # Proxy and Environment Settings
    "proxy_url",
    "proxy_key",
    "environment_variables",
]
SENTRY_PII_DENYLIST = [
    "user_id",
    "email",
    "phone",
    "address",
    "ip_address",
    "SMTP_SENDER_EMAIL",
    "TEST_EMAIL_ADDRESS",
]
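# Sketch of wiring these denylists into sentry-sdk's event scrubber; the
# `EventScrubber` API shown is sentry-sdk's, but confirm it against the
# installed version before relying on it:
#
#     import sentry_sdk
#     from sentry_sdk.scrubber import DEFAULT_DENYLIST, DEFAULT_PII_DENYLIST, EventScrubber
#
#     sentry_sdk.init(
#         dsn="https://publickey@example.ingest.sentry.io/0",  # placeholder DSN
#         event_scrubber=EventScrubber(
#             denylist=DEFAULT_DENYLIST + SENTRY_DENYLIST,
#             pii_denylist=DEFAULT_PII_DENYLIST + SENTRY_PII_DENYLIST,
#         ),
#     )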