Merge pull request #24 from AgentOps-AI/23-update-pricing-to-decimals

areibman · web-flow · commit 7959de29f882 · 2024-01-30T22:26:12.000-08:00
23 update pricing to decimals
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -1 +1 @@
-include model_prices.yaml
+include model_prices.json
diff --git a/README.md b/README.md
@@ -25,8 +25,7 @@ prompt_cost = calculate_prompt_cost(prompt, model)
 completion_cost = calculate_completion_cost(completion, model)
 
 print(f"{prompt_cost} + {completion_cost} = {prompt_cost + completion_cost}")
-# 135 + 140 = 275 ($0.0000275)
-# Priced in TPUs (token price units), which is 1/100,000,000th of a USD.
+# 0.0000135 + 0.000014 = 0.0000275
 ```
 
 ## Installation
@@ -58,23 +57,19 @@ completion = chat_completion.choices[0].message.content
 prompt_cost = calculate_prompt_cost(prompt, model)
 completion_cost = calculate_completion_cost(completion, model)
 print(f"{prompt_cost} + {completion_cost} = {prompt_cost + completion_cost}")
-# 1800 + 1000 = 2800 ($0.0000280)
-
-from tokencost import USD_PER_TPU
-print(f"Cost USD: ${(prompt_cost + completion_cost)/USD_PER_TPU}")
-# $2.8e-05
+# 0.0000180 + 0.000010 = 0.0000280
 ```
 
 **Calculating cost using string prompts instead of messages:**
 ```python
-from tokencost import calculate_prompt_cost, USD_PER_TPU
+from tokencost import calculate_prompt_cost
 
 prompt_string = "Hello world" 
 response = "How may I assist you today?"
 model= "gpt-3.5-turbo"
 
 prompt_cost = calculate_prompt_cost(prompt_string, model)
-print(f"Cost: ${prompt_cost/USD_PER_TPU}")
+print(f"Cost: ${prompt_cost}")
 # Cost: $3e-06
 ```
 
@@ -95,7 +90,11 @@ print(count_string_tokens(prompt="Hello world", model="gpt-3.5-turbo"))
 ```
 
 ## Cost table
-Units denominated in TPUs (Token Price Units = 1/10,000,000 USD). All prices can be located in `model_prices.yaml`.
+Units denominated in USD. All prices can be located in `model_prices.json`.
+
+
+* Prices last updated Jan 30, 2024 from: https://openai.com/pricing and https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json
+
 
 | Model Name | Prompt Cost (USD) | Completion Cost (USD) | Max Prompt Tokens |
 | --- | --- | --- | --- |
diff --git a/pyproject.toml b/pyproject.toml
@@ -6,11 +6,11 @@ build-backend = "setuptools.build_meta"
 include-package-data = true
 
 [tool.setuptools.package-data]
-tokencost = ["model_prices.yaml"]
+tokencost = ["model_prices.json"]
 
 [project]
 name = "tokencost"
-version = "0.0.6"
+version = "0.1.0"
 authors = [
   { name = "Trisha Pan", email = "trishaepan@gmail.com" },
   { name = "Alex Reibman", email = "areibman@gmail.com" },
@@ -24,8 +24,7 @@ classifiers = [
     "Operating System :: OS Independent",
 ]
 dependencies = [
-    "tiktoken>=0.5.2",
-    "pyyaml>=6.0.1"
+    "tiktoken>=0.5.2"
 ]
 
 [project.optional-dependencies]
diff --git a/tests/test_costs.py b/tests/test_costs.py
@@ -2,6 +2,7 @@
 # -*- coding: utf-8 -*-
 
 import pytest
+from decimal import Decimal
 from tokencost.costs import (
     count_message_tokens,
     count_string_tokens,
@@ -129,21 +130,21 @@ def test_count_string_invalid_model():
 @pytest.mark.parametrize(
     "prompt,model,expected_output",
     [
-        (MESSAGES, "gpt-3.5-turbo", 2250),
-        (MESSAGES, "gpt-3.5-turbo-0301", 2550),
-        (MESSAGES, "gpt-3.5-turbo-0613", 2250),
-        (MESSAGES, "gpt-3.5-turbo-16k", 4500),
-        (MESSAGES, "gpt-3.5-turbo-16k-0613", 4500),
-        (MESSAGES, "gpt-3.5-turbo-1106", 750),
-        (MESSAGES, "gpt-3.5-turbo-instruct", 2250),
-        (MESSAGES, "gpt-4", 45000),
-        (MESSAGES, "gpt-4-0314", 45000),
-        (MESSAGES, "gpt-4-32k", 90000),
-        (MESSAGES, "gpt-4-32k-0314", 90000),
-        (MESSAGES, "gpt-4-0613", 45000),
-        (MESSAGES, "gpt-4-1106-preview", 15000),
-        (MESSAGES, "gpt-4-vision-preview", 15000),
-        (STRING, "text-embedding-ada-002", 40),
+        (MESSAGES, "gpt-3.5-turbo", Decimal('0.0000225')),
+        (MESSAGES, "gpt-3.5-turbo-0301", Decimal('0.0000255')),
+        (MESSAGES, "gpt-3.5-turbo-0613", Decimal('0.0000225')),
+        (MESSAGES, "gpt-3.5-turbo-16k", Decimal('0.000045')),
+        (MESSAGES, "gpt-3.5-turbo-16k-0613", Decimal('0.000045')),
+        (MESSAGES, "gpt-3.5-turbo-1106", Decimal('0.000015')),
+        (MESSAGES, "gpt-3.5-turbo-instruct", Decimal('0.0000225')),
+        (MESSAGES, "gpt-4", Decimal('0.00045')),
+        (MESSAGES, "gpt-4-0314", Decimal('0.00045')),
+        (MESSAGES, "gpt-4-32k", Decimal('0.00090')),
+        (MESSAGES, "gpt-4-32k-0314", Decimal('0.00090')),
+        (MESSAGES, "gpt-4-0613", Decimal('0.00045')),
+        (MESSAGES, "gpt-4-1106-preview", Decimal('0.00015')),
+        (MESSAGES, "gpt-4-vision-preview", Decimal('0.00015')),
+        (STRING, "text-embedding-ada-002", Decimal('0.0000004')),
     ],
 )
 def test_calculate_prompt_cost(prompt, model, expected_output):
@@ -163,20 +164,20 @@ def test_invalid_prompt_format():
 @pytest.mark.parametrize(
     "prompt,model,expected_output",
     [
-        (STRING, "gpt-3.5-turbo", 800),
-        (STRING, "gpt-3.5-turbo-0301", 800),
-        (STRING, "gpt-3.5-turbo-0613", 800),
-        (STRING, "gpt-3.5-turbo-16k", 1600),
-        (STRING, "gpt-3.5-turbo-16k-0613", 1600),
-        (STRING, "gpt-3.5-turbo-1106", 600),
-        (STRING, "gpt-3.5-turbo-instruct", 800),
-        (STRING, "gpt-4", 24000),
-        (STRING, "gpt-4-0314", 24000),
-        (STRING, "gpt-4-32k", 48000),
-        (STRING, "gpt-4-32k-0314", 48000),
-        (STRING, "gpt-4-0613", 24000),
-        (STRING, "gpt-4-1106-preview", 12000),
-        (STRING, "gpt-4-vision-preview", 12000),
+        (STRING, "gpt-3.5-turbo", Decimal('0.000008')),
+        (STRING, "gpt-3.5-turbo-0301", Decimal('0.000008')),
+        (STRING, "gpt-3.5-turbo-0613", Decimal('0.000008')),
+        (STRING, "gpt-3.5-turbo-16k", Decimal('0.000016')),
+        (STRING, "gpt-3.5-turbo-16k-0613", Decimal('0.000016')),
+        (STRING, "gpt-3.5-turbo-1106", Decimal('0.000008')),
+        (STRING, "gpt-3.5-turbo-instruct", Decimal('0.000008')),
+        (STRING, "gpt-4", Decimal('0.00024')),
+        (STRING, "gpt-4-0314", Decimal('0.00024')),
+        (STRING, "gpt-4-32k", Decimal('0.00048')),
+        (STRING, "gpt-4-32k-0314", Decimal('0.00048')),
+        (STRING, "gpt-4-0613", Decimal('0.00024')),
+        (STRING, "gpt-4-1106-preview", Decimal('0.00012')),
+        (STRING, "gpt-4-vision-preview", Decimal('0.00012')),
         (STRING, "text-embedding-ada-002", 0),
     ],
 )
diff --git a/tests/test_llama_index_callbacks.py b/tests/test_llama_index_callbacks.py
@@ -5,20 +5,12 @@
 from unittest.mock import MagicMock
 
 # Mock the calculate_prompt_cost and calculate_completion_cost functions
-# and the USD_PER_TPU constant
 
+# 4 tokens
 STRING = "Hello, world!"
 
 
-@pytest.fixture
-def mock_tokencost(monkeypatch):
-    monkeypatch.setattr('tokencost.calculate_prompt_cost', MagicMock(return_value=100))
-    monkeypatch.setattr('tokencost.calculate_completion_cost', MagicMock(return_value=200))
-    monkeypatch.setattr('tokencost.USD_PER_TPU', 10)
-
-# Mock the ChatMessage class
-
-
+# Mock the ChatMessage class in LlamaIndex
 @pytest.fixture
 def mock_chat_message(monkeypatch):
     class MockChatMessage:
@@ -34,30 +26,30 @@ def __str__(self):
 # Test the _calc_llm_event_cost method for prompt and completion
 
 
-def test_calc_llm_event_cost_prompt_completion(mock_tokencost, capsys):
+def test_calc_llm_event_cost_prompt_completion(capsys):
     handler = llama_index.TokenCostHandler(model='gpt-3.5-turbo')
     payload = {
         EventPayload.PROMPT: STRING,
         EventPayload.COMPLETION: STRING
     }
     handler._calc_llm_event_cost(payload)
     captured = capsys.readouterr()
-    assert "# Prompt cost: 6e-06" in captured.out
-    assert "# Completion: 8e-06" in captured.out
+    assert "# Prompt cost: 0.0000060" in captured.out
+    assert "# Completion: 0.000008" in captured.out
 
 # Test the _calc_llm_event_cost method for messages and response
 
 
-def test_calc_llm_event_cost_messages_response(mock_tokencost, mock_chat_message, capsys):
+def test_calc_llm_event_cost_messages_response(mock_chat_message, capsys):
     handler = llama_index.TokenCostHandler(model='gpt-3.5-turbo')
     payload = {
         EventPayload.MESSAGES: [mock_chat_message("message 1"), mock_chat_message("message 2")],
         EventPayload.RESPONSE: "test response"
     }
     handler._calc_llm_event_cost(payload)
     captured = capsys.readouterr()
-    assert "# Prompt cost: 1.05e-05" in captured.out
-    assert "# Completion: 4e-06" in captured.out
+    assert "# Prompt cost: 0.0000105" in captured.out
+    assert "# Completion: 0.000004" in captured.out
 
 # Additional tests can be written for start_trace, end_trace, on_event_start, and on_event_end
 # depending on the specific logic and requirements of those methods.
diff --git a/tokencost/__init__.py b/tokencost/__init__.py
@@ -4,4 +4,4 @@
     calculate_completion_cost,
     calculate_prompt_cost,
 )
-from .constants import TOKEN_COSTS, USD_PER_TPU
+from .constants import TOKEN_COSTS
diff --git a/tokencost/callbacks/llama_index.py b/tokencost/callbacks/llama_index.py
@@ -1,7 +1,7 @@
 from typing import Any, Dict, List, Optional, cast
 from llama_index.callbacks.base_handler import BaseCallbackHandler
 from llama_index.callbacks.schema import CBEventType, EventPayload
-from tokencost import calculate_prompt_cost, calculate_completion_cost, USD_PER_TPU
+from tokencost import calculate_prompt_cost, calculate_completion_cost
 
 
 class TokenCostHandler(BaseCallbackHandler):
@@ -29,15 +29,15 @@ def _calc_llm_event_cost(self, payload: dict) -> None:
         if EventPayload.PROMPT in payload:
             prompt = str(payload.get(EventPayload.PROMPT))
             completion = str(payload.get(EventPayload.COMPLETION))
-            prompt_cost = calculate_prompt_cost(prompt, self.model) / USD_PER_TPU
-            completion_cost = calculate_completion_cost(completion, self.model) / USD_PER_TPU
+            prompt_cost = calculate_prompt_cost(prompt, self.model)
+            completion_cost = calculate_completion_cost(completion, self.model)
 
         elif EventPayload.MESSAGES in payload:
             messages = cast(List[ChatMessage], payload.get(EventPayload.MESSAGES, []))
             messages_str = "\n".join([str(x) for x in messages])
-            prompt_cost = calculate_prompt_cost(messages_str, self.model) / USD_PER_TPU
+            prompt_cost = calculate_prompt_cost(messages_str, self.model)
             response = str(payload.get(EventPayload.RESPONSE))
-            completion_cost = calculate_completion_cost(response, self.model) / USD_PER_TPU
+            completion_cost = calculate_completion_cost(response, self.model)
 
         print(f"# Prompt cost: {prompt_cost}")
         print(f"# Completion: {completion_cost}")
diff --git a/tokencost/constants.py b/tokencost/constants.py
@@ -1,5 +1,7 @@
 import os
-import yaml
+import json
+from urllib.request import urlopen
+
 """
 Prompt (aka context) tokens are based on number of words + other chars (eg spaces and punctuation) in input.
 Completion tokens are similarly based on how long chatGPT's response is.
@@ -11,16 +13,23 @@
 
 Note: When asking follow-up questions, everything above and including your follow-up question
 is considered a prompt (for the purpose of context) and will thus cost prompt tokens.
-
-1 Token Price Unit (TPU) is defined as 1/100,000,000 of $1 (USD). 1,000,000 TPUs would equate to $0.01.
 """
 
-USD_PER_TPU = 100_000_000
-
 # How to read TOKEN_COSTS:
-# Each prompt token costs __ TPUs per token.
-# Each completion token costs __ TPUs per token.
+# Each prompt token costs __ USD per token.
+# Each completion token costs __ USD per token.
 # Max prompt limit of each model is __ tokens.
 
-with open(os.path.join(os.path.dirname(__file__), "model_prices.yaml"), "r") as f:
-    TOKEN_COSTS = yaml.safe_load(f)
+# Fetch the latest prices using urllib.request
+PRICES_URL = "https://raw.githubusercontent.com/BerriAI/litellm/main/model_prices_and_context_window.json"
+
+try:
+    with urlopen(PRICES_URL) as response:
+        if response.status == 200:
+            TOKEN_COSTS = json.loads(response.read())
+        else:
+            raise Exception("Failed to fetch token costs, status code: {}".format(response.status))
+except Exception:
+    # If fetching fails, use the local model_prices.json as a fallback
+    with open(os.path.join(os.path.dirname(__file__), "model_prices.json"), "r") as f:
+        TOKEN_COSTS = json.load(f)
diff --git a/tokencost/costs.py b/tokencost/costs.py
@@ -4,6 +4,7 @@
 import tiktoken
 from typing import Union, List, Dict
 from .constants import TOKEN_COSTS
+from decimal import Decimal
 
 
 # TODO: Add Claude support
@@ -90,27 +91,26 @@ def count_string_tokens(prompt: str, model: str) -> int:
     return len(encoding.encode(prompt))
 
 
-def calculate_prompt_cost(prompt: Union[List[dict], str], model: str) -> int:
+def calculate_prompt_cost(prompt: Union[List[dict], str], model: str) -> Decimal:
     """
-    Calculate the prompt's cost in token price units (TPU). 1 TPU = $1/10,000,000.
-    e.g. 100,000 TPUs = $0.01.
+    Calculate the prompt's cost in USD.
 
     Args:
         prompt (Union[List[dict], str]): List of message objects or single string prompt.
         model (str): The model name.
 
     Returns:
-        int: The calculated cost in TPUs.
+        Decimal: The calculated cost in USD.
 
     e.g.:
     >>> prompt = [{ "role": "user", "content": "Hello world"},
                   { "role": "assistant", "content": "How may I assist you today?"}]
     >>>calculate_prompt_cost(prompt, "gpt-3.5-turbo")
-    300
+    Decimal('0.0000300')
     # or
     >>> prompt = "Hello world"
     >>> calculate_prompt_cost(prompt, "gpt-3.5-turbo")
-    30
+    Decimal('0.0000030')
     """
     model = model.lower()
     if model not in TOKEN_COSTS:
@@ -129,34 +129,32 @@ def calculate_prompt_cost(prompt: Union[List[dict], str], model: str) -> int:
         if isinstance(prompt, str)
         else count_message_tokens(prompt, model)
     )
-    prompt_cost = TOKEN_COSTS[model]["prompt"]
+    prompt_cost = TOKEN_COSTS[model]["input_cost_per_token"]
+    return Decimal(str(prompt_cost)) * Decimal(prompt_tokens)
 
-    return prompt_cost * prompt_tokens
 
-
-def calculate_completion_cost(completion: str, model: str) -> int:
+def calculate_completion_cost(completion: str, model: str) -> Decimal:
     """
-    Calculate the prompt's cost in token price units (TPU). 1 TPU = $1/10,000,000.
-    e.g. 100,000 TPUs = $0.01.
+    Calculate the completion's cost in USD.
 
     Args:
         completion (str): Completion string.
         model (str): The model name.
 
     Returns:
-        int: The calculated cost in TPUs.
+        Decimal: The calculated cost in USD.
 
     e.g.:
     >>> completion = "How may I assist you today?"
     >>> calculate_completion_cost(completion, "gpt-3.5-turbo")
-    140
+    Decimal('0.000014')
     """
     if model not in TOKEN_COSTS:
         raise KeyError(
             f"""Model {model} is not implemented.
             Double-check your spelling, or submit an issue/PR"""
         )
     completion_tokens = count_string_tokens(completion, model)
-    completion_cost = TOKEN_COSTS[model]["completion"]
+    completion_cost = TOKEN_COSTS[model]["output_cost_per_token"]
 
-    return completion_cost * completion_tokens
+    return Decimal(str(completion_cost)) * Decimal(completion_tokens)
diff --git a/tokencost/model_prices.json b/tokencost/model_prices.json
diff --git a/tokencost/model_prices.yaml b/tokencost/model_prices.yaml

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-include model_prices.yaml`
	`1`	`+include model_prices.json`
Original file line number	Diff line number	Diff line change
`@@ -4,4 +4,4 @@`
`4`	`4`	`calculate_completion_cost,`
`5`	`5`	`calculate_prompt_cost,`
`6`	`6`	`)`
`7`		`-from .constants import TOKEN_COSTS, USD_PER_TPU`
	`7`	`+from .constants import TOKEN_COSTS`