From 08fb954f15f2349d6c7bf7420b783b4f7bc056e1 Mon Sep 17 00:00:00 2001 From: Ralf Waldukat Date: Tue, 13 Jan 2026 23:08:46 +0700 Subject: [PATCH] fix: clear prompt for recurrent / hybrid models when only a partial prefix matches --- .github/workflows/test.yaml | 20 +++++-- CHANGELOG.md | 1 + llama_cpp/llama.py | 22 ++++++++ tests/test_llama.py | 106 ++++++++++++++++++++++++++++++++++++ 4 files changed, 143 insertions(+), 6 deletions(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 8a6845ff24..b8f5566bb0 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -10,6 +10,11 @@ on: env: REPO_ID: lmstudio-community/Qwen3.5-0.8B-GGUF MODEL_FILE: Qwen3.5-0.8B-Q8_0.gguf + RECURRENT_REPO_ID: QuantFactory/mamba-130m-hf-GGUF + RECURRENT_MODEL_FILE: mamba-130m-hf.Q2_K.gguf + HYBRID_REPO_ID: tiiuae/Falcon-H1-Tiny-90M-Instruct-GGUF + HYBRID_MODEL_FILE: Falcon-H1-Tiny-90M-Instruct-Q2_K.gguf + MODEL_CACHE_KEY: qwen35-q8-mamba130m-q2-falconh1tiny-q2 jobs: download-model: @@ -22,12 +27,15 @@ jobs: - name: Install huggingface-hub run: pip install huggingface-hub - name: Download model - run: hf download ${{ env.REPO_ID }} ${{ env.MODEL_FILE }} + run: | + hf download ${{ env.REPO_ID }} ${{ env.MODEL_FILE }} + hf download ${{ env.RECURRENT_REPO_ID }} ${{ env.RECURRENT_MODEL_FILE }} + hf download ${{ env.HYBRID_REPO_ID }} ${{ env.HYBRID_MODEL_FILE }} - name: Cache model uses: actions/cache@v4 with: path: ~/.cache/huggingface/hub - key: ${{ runner.os }}-model-${{ env.REPO_ID }}-${{ env.MODEL_FILE }} + key: ${{ runner.os }}-model-${{ env.MODEL_CACHE_KEY }} build-linux: needs: download-model @@ -49,7 +57,7 @@ jobs: uses: actions/cache@v4 with: path: ~/.cache/huggingface/hub - key: ${{ runner.os }}-model-${{ env.REPO_ID }}-${{ env.MODEL_FILE }} + key: ${{ runner.os }}-model-${{ env.MODEL_CACHE_KEY }} - name: Install dependencies (Linux/MacOS) run: | python -m pip install --upgrade pip @@ -81,7 +89,7 @@ jobs: uses: actions/cache@v4 with: path: ~/.cache/huggingface/hub - key: ${{ runner.os }}-model-${{ env.REPO_ID }}-${{ env.MODEL_FILE }} + key: ${{ runner.os }}-model-${{ env.MODEL_CACHE_KEY }} - name: Install dependencies (Windows) run: | @@ -121,7 +129,7 @@ jobs: uses: actions/cache@v4 with: path: ~/.cache/huggingface/hub - key: ${{ runner.os }}-model-${{ env.REPO_ID }}-${{ env.MODEL_FILE }} + key: ${{ runner.os }}-model-${{ env.MODEL_CACHE_KEY }} - name: Install dependencies (Linux/MacOS) run: | @@ -157,7 +165,7 @@ jobs: uses: actions/cache@v4 with: path: ~/.cache/huggingface/hub - key: ${{ runner.os }}-model-${{ env.REPO_ID }}-${{ env.MODEL_FILE }} + key: ${{ runner.os }}-model-${{ env.MODEL_CACHE_KEY }} - name: Install dependencies run: | diff --git a/CHANGELOG.md b/CHANGELOG.md index 51a0abad85..21fa49f28e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +- fix: clear prompt for recurrent / hybrid models when only a partial prefix matches by @avion23 in #2108 - fix: match Transformers `tojson` in chat template rendering by @CISC in #1486 - fix: use env var configured multimodal library override paths when loading shared libraries by @navratil-matej in #1782 - feat: add Jinja2 loop controls to chat templates by @handshape in #2018 diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 75c74b41fc..b904b4080c 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -559,6 +559,10 @@ def free_lora_adapter(): self._sampler = None + # Cache recurrent/hybrid model detection to avoid repeated FFI calls + self._is_recurrent = llama_cpp.llama_model_is_recurrent(self._model.model) + self._is_hybrid = llama_cpp.llama_model_is_hybrid(self._model.model) + @property def ctx(self) -> llama_cpp.llama_context_p: return self._ctx.ctx @@ -644,6 +648,11 @@ def reset(self): """Reset the model state.""" self.n_tokens = 0 + if self._is_recurrent or self._is_hybrid: + mem = llama_cpp.llama_get_memory(self._ctx.ctx) + if mem is not None: + llama_cpp.llama_memory_clear(mem, True) + def eval(self, tokens: Sequence[int]): """Evaluate a list of tokens. @@ -899,6 +908,19 @@ def generate( longest_prefix += 1 else: break + + # Recurrent and hybrid models cannot rewind state; reset if needed + if ( + self._is_recurrent or self._is_hybrid + ) and longest_prefix < self.n_tokens: + longest_prefix = 0 + reset = True + if self.verbose: + print( + "Llama.generate: recurrent/hybrid model requires full state reset", + file=sys.stderr, + ) + if longest_prefix > 0: if self._ctx.kv_cache_seq_rm(-1, longest_prefix, -1): reset = False diff --git a/tests/test_llama.py b/tests/test_llama.py index d4e6031c70..336d6a6122 100644 --- a/tests/test_llama.py +++ b/tests/test_llama.py @@ -72,6 +72,22 @@ def llama_cpp_embedding_model_path(): return model_path +@pytest.fixture +def llama_cpp_recurrent_model_path(): + repo_id = "QuantFactory/mamba-130m-hf-GGUF" + filename = "mamba-130m-hf.Q2_K.gguf" + model_path = hf_hub_download(repo_id, filename) + return model_path + + +@pytest.fixture +def llama_cpp_hybrid_model_path(): + repo_id = "tiiuae/Falcon-H1-Tiny-90M-Instruct-GGUF" + filename = "Falcon-H1-Tiny-90M-Instruct-Q2_K.gguf" + model_path = hf_hub_download(repo_id, filename) + return model_path + + def test_real_model(llama_cpp_model_path): import os @@ -233,6 +249,96 @@ def logit_processor_func(input_ids, logits): assert number_1 == number_3 +def test_real_llama_repeated_prompt_cache(llama_cpp_model_path): + model = llama_cpp.Llama( + llama_cpp_model_path, + n_ctx=32, + n_batch=32, + n_ubatch=32, + n_threads=multiprocessing.cpu_count(), + n_threads_batch=multiprocessing.cpu_count(), + logits_all=False, + flash_attn=True, + verbose=False, + ) + prompt = "The quick brown fox jumps over the lazy dog. The quick brown fox" + + output_1 = model.create_completion( + prompt, + max_tokens=6, + temperature=0.0, + seed=1337, + ) + output_2 = model.create_completion( + prompt, + max_tokens=6, + temperature=0.0, + seed=1337, + ) + + assert output_1["choices"][0]["text"] == " jumps over the lazy dog." + assert output_2["choices"][0]["text"] == output_1["choices"][0]["text"] + + +def _assert_prompt_cache_reset_handles_history_edit( + model_path, + *, + is_recurrent: bool, + is_hybrid: bool, +): + model = llama_cpp.Llama( + model_path, + n_ctx=32, + n_batch=32, + n_ubatch=32, + n_threads=multiprocessing.cpu_count(), + n_threads_batch=multiprocessing.cpu_count(), + logits_all=False, + verbose=False, + ) + + assert model._is_recurrent is is_recurrent + assert model._is_hybrid is is_hybrid + + first_prompt = "The quick brown fox" + second_prompt = "The slow brown fox" + first_tokens = model.tokenize(first_prompt.encode(), add_bos=True, special=True) + second_tokens = model.tokenize(second_prompt.encode(), add_bos=True, special=True) + + assert first_tokens != second_tokens + assert first_tokens[0] == second_tokens[0] + + first_output = model.create_completion( + first_prompt, + max_tokens=1, + temperature=0.0, + ) + assert isinstance(first_output["choices"][0]["text"], str) + + second_output = model.create_completion( + second_prompt, + max_tokens=1, + temperature=0.0, + ) + assert isinstance(second_output["choices"][0]["text"], str) + + +def test_recurrent_model_prompt_cache_reset(llama_cpp_recurrent_model_path): + _assert_prompt_cache_reset_handles_history_edit( + llama_cpp_recurrent_model_path, + is_recurrent=True, + is_hybrid=False, + ) + + +def test_hybrid_model_prompt_cache_reset(llama_cpp_hybrid_model_path): + _assert_prompt_cache_reset_handles_history_edit( + llama_cpp_hybrid_model_path, + is_recurrent=False, + is_hybrid=True, + ) + + def test_real_llama_embeddings(llama_cpp_embedding_model_path): model = llama_cpp.Llama( llama_cpp_embedding_model_path,