diff --git a/README.md b/README.md index 22b263585..e949142df 100644 --- a/README.md +++ b/README.md @@ -278,6 +278,7 @@ GuardDog's behavior can be customized using environment variables: | `GUARDDOG_VERIFY_EXHAUSTIVE_DEPENDENCIES` | Analyze all possible versions of dependencies (`true`/`false`) | `false` | | `GUARDDOG_TOP_PACKAGES_CACHE_LOCATION` | Location of the top packages cache directory | `guarddog/analyzer/metadata/resources` | | `GUARDDOG_YARA_EXT_EXCLUDE` | Comma-separated list of file extensions to exclude from YARA scanning | `ini,md,rst,txt,lock,json,yaml,yml,toml,xml,html,csv,sql,pdf,doc,docx,ppt,pptx,xls,xlsx,odt,changelog,readme,makefile,dockerfile,pkg-info,d.ts` | +| `GUARDDOG_YARA_PATH_EXCLUDE_REGEX` | Regex pattern to exclude files from YARA scanning (matched against relative file path) | _empty_ (disabled) | #### Archive Extraction Security Limits diff --git a/guarddog/analyzer/analyzer.py b/guarddog/analyzer/analyzer.py index fe8259593..57947e8a9 100644 --- a/guarddog/analyzer/analyzer.py +++ b/guarddog/analyzer/analyzer.py @@ -1,5 +1,6 @@ import logging import os +import re import yara # type: ignore from collections import defaultdict @@ -17,7 +18,7 @@ validate_identifies, validate_mitre_tactics, ) -from guarddog.utils.config import YARA_EXT_EXCLUDE +from guarddog.utils.config import YARA_EXT_EXCLUDE, YARA_PATH_EXCLUDE_REGEX from guarddog.ecosystems import ECOSYSTEM, LANGUAGE SOURCECODE_RULES_PATH = os.path.join(os.path.dirname(__file__), "sourcecode") @@ -378,6 +379,22 @@ def analyze_yara(self, path: str, rules: Optional[set] = None) -> dict: log.debug("No yara rules to run") return {"results": results, "errors": errors, "issues": issues} + regex_exclude = None + if YARA_PATH_EXCLUDE_REGEX: + try: + regex_exclude = re.compile(YARA_PATH_EXCLUDE_REGEX) + except re.error as e: + return { + "results": results, + "errors": { + "rules-all": ( + "failed to run rule: invalid " + f"GUARDDOG_YARA_PATH_EXCLUDE_REGEX: {str(e)}" + ) + }, + "issues": issues, + } + import time # Get rule metadata to access max_hits @@ -425,6 +442,10 @@ def analyze_yara(self, path: str, rules: Optional[set] = None) -> dict: scan_file_target_abspath, path ) + # Skip files matching global exclusion regex + if regex_exclude and regex_exclude.search(scan_file_target_relpath): + continue + # Check path_include patterns if specified (takes precedence) if path_include: patterns = [p.strip() for p in path_include.split(",")] diff --git a/guarddog/utils/config.py b/guarddog/utils/config.py index 1824cf11b..9ada21f18 100644 --- a/guarddog/utils/config.py +++ b/guarddog/utils/config.py @@ -40,6 +40,15 @@ "pptx,xls,xlsx,odt,changelog,readme,makefile,dockerfile,pkg-info,d.ts", ).split(",") +""" +This parameter specifies a regex pattern to exclude files from YARA scanning. +The regex is matched against the file path relative to the scan root. +- Default: empty (disabled) +""" +YARA_PATH_EXCLUDE_REGEX: str = os.environ.get( + "GUARDDOG_YARA_PATH_EXCLUDE_REGEX", "" +) + """ This parameter specifies the maximum uncompressed size allowed for archive extraction - Default: 2 GB in bytes diff --git a/tests/core/test_sourcecode_analyzer.py b/tests/core/test_sourcecode_analyzer.py index 6d92ebf02..ed9435ecb 100644 --- a/tests/core/test_sourcecode_analyzer.py +++ b/tests/core/test_sourcecode_analyzer.py @@ -6,6 +6,7 @@ from guarddog import ecosystems from guarddog.analyzer.analyzer import Analyzer +import guarddog.analyzer.analyzer as analyzer_module from guarddog.ecosystems import LANGUAGE pypi_analyzer = Analyzer(ecosystem=ecosystems.ECOSYSTEM.PYPI) @@ -69,6 +70,49 @@ def test_get_snippet_file_not_found(): assert snippet == "" +def test_analyze_yara_excludes_files_matching_regex(): + analyzer = Analyzer(ecosystem=ecosystems.ECOSYSTEM.PYPI) + rule = next(iter(analyzer.yara_ruleset)) + + class FakeCompiledRules: + def __init__(self): + self.scanned_files = [] + + def match(self, file_path): + self.scanned_files.append(file_path) + return [] + + fake_compiled_rules = FakeCompiledRules() + + with patch.object(analyzer_module, "YARA_EXT_EXCLUDE", []), patch.object( + analyzer_module, "YARA_PATH_EXCLUDE_REGEX", r"\.min\.js$" + ), patch.object( + analyzer_module.yara, "compile", return_value=fake_compiled_rules + ), patch.object( + analyzer_module.os, + "walk", + return_value=[("/tmp/pkg", [], ["keep.py", "bundle.min.js"])], + ): + analyzer.analyze_yara("/tmp/pkg", {rule}) + + assert "/tmp/pkg/keep.py" in fake_compiled_rules.scanned_files + assert "/tmp/pkg/bundle.min.js" not in fake_compiled_rules.scanned_files + + +def test_analyze_yara_returns_error_for_invalid_exclude_regex(): + analyzer = Analyzer(ecosystem=ecosystems.ECOSYSTEM.PYPI) + rule = next(iter(analyzer.yara_ruleset)) + + with patch.object(analyzer_module, "YARA_PATH_EXCLUDE_REGEX", "("), patch.object( + analyzer_module.yara, "compile" + ) as yara_compile: + result = analyzer.analyze_yara("/tmp/pkg", {rule}) + + assert "rules-all" in result["errors"] + assert "GUARDDOG_YARA_PATH_EXCLUDE_REGEX" in result["errors"]["rules-all"] + yara_compile.assert_not_called() + + # Comment filtering tests