30 changes: 30 additions & 0 deletions llm_agents/llama_cpp_llm.py
@@ -0,0 +1,30 @@
from pydantic import BaseModel
from llama_cpp import Llama


class LlamaCppLLM(BaseModel):
    model: Llama = None
    seed: int = 1337
    max_tokens: int = 256
    n_threads: int = 4  # Number of threads to use.
    n_batch: int = 8  # Maximum number of tokens to batch.
    n_ctx: int = 512  # Token context window.
    top_k: int = 40  # The top-k value to use for sampling.
    top_p: float = 0.95  # The top-p value to use for sampling.
    temp: float = 0.8  # The temperature to use for sampling.
    repeat_penalty: float = 1.1  # The penalty to apply to repeated tokens.

    class Config:
        arbitrary_types_allowed = True  # Llama is not a pydantic-native type.

    def __init__(self, model_path: str, **kwargs):
        super().__init__(**kwargs)
        # Forward the model-level settings; without this they are declared but unused.
        self.model = Llama(
            model_path=model_path,
            seed=self.seed,
            n_ctx=self.n_ctx,
            n_threads=self.n_threads,
            n_batch=self.n_batch,
        )

    def generate(self, prompt: str) -> str:
        # Forward the sampling settings. Note llama_cpp names this argument
        # `temperature`, while the field here is `temp`.
        response = self.model(
            prompt=prompt,
            max_tokens=self.max_tokens,
            top_k=self.top_k,
            top_p=self.top_p,
            temperature=self.temp,
            repeat_penalty=self.repeat_penalty,
        )
        return response['choices'][0]['text']


if __name__ == '__main__':
    llm = LlamaCppLLM(model_path='./models/llama-7B-ggml-model-q4_0.bin')
    result = llm.generate(prompt='Who is the president of the USA?')
    print(result)
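
A minimal usage sketch, added for illustration and not part of the diff: it assumes the class above and shows the pydantic field defaults being overridden per instance at construction, e.g. a lower temperature for more deterministic output.

# Illustrative only: override the sampling defaults at construction time.
llm = LlamaCppLLM(
    model_path='./models/llama-7B-ggml-model-q4_0.bin',
    temp=0.2,       # lower than the 0.8 default for less random sampling
    max_tokens=64,  # shorter completions than the 256 default
)
print(llm.generate(prompt='Name the capital of France.'))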
1 change: 1 addition & 0 deletions requirements.txt
@@ -2,3 +2,4 @@ google-search-results>=2.4.2
openai>=0.27.0
pydantic>=1.10.5
requests>=2.28.2
llama-cpp-python>=0.1.19