Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 14 additions & 6 deletions .github/workflows/test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,11 @@ on:
env:
REPO_ID: lmstudio-community/Qwen3.5-0.8B-GGUF
MODEL_FILE: Qwen3.5-0.8B-Q8_0.gguf
RECURRENT_REPO_ID: QuantFactory/mamba-130m-hf-GGUF
RECURRENT_MODEL_FILE: mamba-130m-hf.Q2_K.gguf
HYBRID_REPO_ID: tiiuae/Falcon-H1-Tiny-90M-Instruct-GGUF
HYBRID_MODEL_FILE: Falcon-H1-Tiny-90M-Instruct-Q2_K.gguf
MODEL_CACHE_KEY: qwen35-q8-mamba130m-q2-falconh1tiny-q2

jobs:
download-model:
Expand All @@ -22,12 +27,15 @@ jobs:
- name: Install huggingface-hub
run: pip install huggingface-hub
- name: Download model
run: hf download ${{ env.REPO_ID }} ${{ env.MODEL_FILE }}
run: |
hf download ${{ env.REPO_ID }} ${{ env.MODEL_FILE }}
hf download ${{ env.RECURRENT_REPO_ID }} ${{ env.RECURRENT_MODEL_FILE }}
hf download ${{ env.HYBRID_REPO_ID }} ${{ env.HYBRID_MODEL_FILE }}
- name: Cache model
uses: actions/cache@v4
with:
path: ~/.cache/huggingface/hub
key: ${{ runner.os }}-model-${{ env.REPO_ID }}-${{ env.MODEL_FILE }}
key: ${{ runner.os }}-model-${{ env.MODEL_CACHE_KEY }}

build-linux:
needs: download-model
Expand All @@ -49,7 +57,7 @@ jobs:
uses: actions/cache@v4
with:
path: ~/.cache/huggingface/hub
key: ${{ runner.os }}-model-${{ env.REPO_ID }}-${{ env.MODEL_FILE }}
key: ${{ runner.os }}-model-${{ env.MODEL_CACHE_KEY }}
- name: Install dependencies (Linux/MacOS)
run: |
python -m pip install --upgrade pip
Expand Down Expand Up @@ -81,7 +89,7 @@ jobs:
uses: actions/cache@v4
with:
path: ~/.cache/huggingface/hub
key: ${{ runner.os }}-model-${{ env.REPO_ID }}-${{ env.MODEL_FILE }}
key: ${{ runner.os }}-model-${{ env.MODEL_CACHE_KEY }}

- name: Install dependencies (Windows)
run: |
Expand Down Expand Up @@ -121,7 +129,7 @@ jobs:
uses: actions/cache@v4
with:
path: ~/.cache/huggingface/hub
key: ${{ runner.os }}-model-${{ env.REPO_ID }}-${{ env.MODEL_FILE }}
key: ${{ runner.os }}-model-${{ env.MODEL_CACHE_KEY }}

- name: Install dependencies (Linux/MacOS)
run: |
Expand Down Expand Up @@ -157,7 +165,7 @@ jobs:
uses: actions/cache@v4
with:
path: ~/.cache/huggingface/hub
key: ${{ runner.os }}-model-${{ env.REPO_ID }}-${{ env.MODEL_FILE }}
key: ${{ runner.os }}-model-${{ env.MODEL_CACHE_KEY }}

- name: Install dependencies
run: |
Expand Down
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

- fix: clear prompt for recurrent / hybrid models when only a partial prefix matches by @avion23 in #2108
- fix: match Transformers `tojson` in chat template rendering by @CISC in #1486
- fix: use env var configured multimodal library override paths when loading shared libraries by @navratil-matej in #1782
- feat: add Jinja2 loop controls to chat templates by @handshape in #2018
Expand Down
22 changes: 22 additions & 0 deletions llama_cpp/llama.py
Original file line number Diff line number Diff line change
Expand Up @@ -559,6 +559,10 @@ def free_lora_adapter():

self._sampler = None

# Cache recurrent/hybrid model detection to avoid repeated FFI calls
self._is_recurrent = llama_cpp.llama_model_is_recurrent(self._model.model)
self._is_hybrid = llama_cpp.llama_model_is_hybrid(self._model.model)

@property
def ctx(self) -> llama_cpp.llama_context_p:
return self._ctx.ctx
Expand Down Expand Up @@ -644,6 +648,11 @@ def reset(self):
"""Reset the model state."""
self.n_tokens = 0

if self._is_recurrent or self._is_hybrid:
mem = llama_cpp.llama_get_memory(self._ctx.ctx)
if mem is not None:
llama_cpp.llama_memory_clear(mem, True)

def eval(self, tokens: Sequence[int]):
"""Evaluate a list of tokens.

Expand Down Expand Up @@ -899,6 +908,19 @@ def generate(
longest_prefix += 1
else:
break

# Recurrent and hybrid models cannot rewind state; reset if needed
if (
self._is_recurrent or self._is_hybrid
) and longest_prefix < self.n_tokens:
longest_prefix = 0
reset = True
if self.verbose:
print(
"Llama.generate: recurrent/hybrid model requires full state reset",
file=sys.stderr,
)

if longest_prefix > 0:
if self._ctx.kv_cache_seq_rm(-1, longest_prefix, -1):
reset = False
Expand Down
106 changes: 106 additions & 0 deletions tests/test_llama.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,22 @@ def llama_cpp_embedding_model_path():
return model_path


@pytest.fixture
def llama_cpp_recurrent_model_path():
    """Return the path that hf_hub_download yields for the recurrent test model."""
    return hf_hub_download(
        "QuantFactory/mamba-130m-hf-GGUF",
        "mamba-130m-hf.Q2_K.gguf",
    )


@pytest.fixture
def llama_cpp_hybrid_model_path():
    """Return the path that hf_hub_download yields for the hybrid test model."""
    return hf_hub_download(
        "tiiuae/Falcon-H1-Tiny-90M-Instruct-GGUF",
        "Falcon-H1-Tiny-90M-Instruct-Q2_K.gguf",
    )


def test_real_model(llama_cpp_model_path):
import os

Expand Down Expand Up @@ -233,6 +249,96 @@ def logit_processor_func(input_ids, logits):
assert number_1 == number_3


def test_real_llama_repeated_prompt_cache(llama_cpp_model_path):
    """Complete the same prompt twice on one model instance.

    The first completion must equal the expected continuation and the
    second completion must equal the first.
    """
    thread_count = multiprocessing.cpu_count()
    llm = llama_cpp.Llama(
        llama_cpp_model_path,
        n_ctx=32,
        n_batch=32,
        n_ubatch=32,
        n_threads=thread_count,
        n_threads_batch=thread_count,
        logits_all=False,
        flash_attn=True,
        verbose=False,
    )
    text = "The quick brown fox jumps over the lazy dog. The quick brown fox"
    # Both calls use identical sampling settings so their outputs are comparable.
    completion_kwargs = {"max_tokens": 6, "temperature": 0.0, "seed": 1337}

    completions = [
        llm.create_completion(text, **completion_kwargs) for _ in range(2)
    ]
    first_text, second_text = (
        completion["choices"][0]["text"] for completion in completions
    )

    assert first_text == " jumps over the lazy dog."
    assert second_text == first_text


def _assert_prompt_cache_reset_handles_history_edit(
    model_path,
    *,
    is_recurrent: bool,
    is_hybrid: bool,
):
    """Check that a model completes two prompts that differ after a shared start.

    Loads the model at ``model_path``, asserts that its ``_is_recurrent`` and
    ``_is_hybrid`` attributes are exactly the given booleans, then runs one
    single-token completion per prompt and asserts each returns a ``str``.

    Args:
        model_path: Path passed as the first argument to ``llama_cpp.Llama``.
        is_recurrent: Expected value of ``model._is_recurrent``.
        is_hybrid: Expected value of ``model._is_hybrid``.
    """
    thread_count = multiprocessing.cpu_count()
    llm = llama_cpp.Llama(
        model_path,
        n_ctx=32,
        n_batch=32,
        n_ubatch=32,
        n_threads=thread_count,
        n_threads_batch=thread_count,
        logits_all=False,
        verbose=False,
    )

    assert llm._is_recurrent is is_recurrent
    assert llm._is_hybrid is is_hybrid

    prompts = ("The quick brown fox", "The slow brown fox")
    original_ids, edited_ids = (
        llm.tokenize(prompt.encode(), add_bos=True, special=True)
        for prompt in prompts
    )

    # The prompts must tokenize differently yet begin with the same token, so
    # the second request only partially matches the first.
    # NOTE(review): presumably this is what drives the recurrent/hybrid reset
    # path in Llama.generate -- confirm against that method.
    assert original_ids != edited_ids
    assert original_ids[0] == edited_ids[0]

    for prompt in prompts:
        result = llm.create_completion(
            prompt,
            max_tokens=1,
            temperature=0.0,
        )
        assert isinstance(result["choices"][0]["text"], str)


def test_recurrent_model_prompt_cache_reset(llama_cpp_recurrent_model_path):
    """Run the prompt-edit check expecting a recurrent, non-hybrid model."""
    expected_flags = {"is_recurrent": True, "is_hybrid": False}
    _assert_prompt_cache_reset_handles_history_edit(
        llama_cpp_recurrent_model_path,
        **expected_flags,
    )


def test_hybrid_model_prompt_cache_reset(llama_cpp_hybrid_model_path):
    """Run the prompt-edit check expecting a hybrid, non-recurrent model."""
    expected_flags = {"is_recurrent": False, "is_hybrid": True}
    _assert_prompt_cache_reset_handles_history_edit(
        llama_cpp_hybrid_model_path,
        **expected_flags,
    )


def test_real_llama_embeddings(llama_cpp_embedding_model_path):
model = llama_cpp.Llama(
llama_cpp_embedding_model_path,
Expand Down