From bc507c0edad02f3bde3d5936f7a9bbbd3922a3c1 Mon Sep 17 00:00:00 2001 From: Lalit Shrotriya Date: Thu, 18 Jun 2026 10:21:36 +0000 Subject: [PATCH] fix(supply_chain): parse pyproject.toml with tomllib instead of requirements regex MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit _analyze_dependencies routed pyproject.toml through _extract_packages_from_requirements, which treats every TOML key (requires-python, name, description, authors…) as a package name, causing false-positive SC5/SC6 findings on metadata fields. Add _extract_packages_from_pyproject (Python 3.11+ tomllib, stdlib) that reads only [project].dependencies, [project.optional-dependencies], and [build-system].requires — the three PEP 621 / PEP 517 dependency arrays. A frozen set of PEP 621 metadata keys acts as a secondary guard. Malformed TOML is caught and returns [] so the analyzer never crashes. Closes #2 Signed-off-by: Lalit Shrotriya --- .../analyzers/static_patterns_supply_chain.py | 89 ++++++++++++++++++- tests/unit/test_patterns_new.py | 82 +++++++++++++++++ 2 files changed, 170 insertions(+), 1 deletion(-) diff --git a/src/skillspector/nodes/analyzers/static_patterns_supply_chain.py b/src/skillspector/nodes/analyzers/static_patterns_supply_chain.py index 3a4fcac..f92bcf9 100644 --- a/src/skillspector/nodes/analyzers/static_patterns_supply_chain.py +++ b/src/skillspector/nodes/analyzers/static_patterns_supply_chain.py @@ -28,6 +28,7 @@ import re import sys +import tomllib from skillspector.logging_config import get_logger from skillspector.models import AnalyzerFinding, Finding, Location, Severity @@ -401,6 +402,89 @@ def _extract_packages_from_requirements(content: str) -> list[tuple[str, str | N return results +_PEP621_METADATA_KEYS = frozenset( + { + "name", + "version", + "description", + "readme", + "license", + "authors", + "maintainers", + "keywords", + "classifiers", + "urls", + "requires-python", + "dynamic", + } +) + +_PKG_NAME_RE = re.compile( + r"^([a-zA-Z][a-zA-Z0-9._-]*)(?:\[.*?\])?\s*(?:[=<>!~]=?\s*[\d.*]+)?", +) + + +def _extract_packages_from_pyproject(content: str) -> list[tuple[str, str | None, int]]: + """Extract (package_name, version_or_None, line_number) from pyproject.toml content. + + Only reads PEP 621 ``[project].dependencies``, + ``[project.optional-dependencies]``, and ``[build-system].requires`` so that + TOML metadata keys (``name``, ``requires-python``, etc.) are never treated + as package names. Returns ``[]`` on any TOML parse error. + """ + if not content.strip(): + return [] + try: + data = tomllib.loads(content) + except tomllib.TOMLDecodeError: + return [] + + dep_strings: list[str] = [] + + project = data.get("project", {}) + if isinstance(project, dict): + # [project].dependencies + deps = project.get("dependencies", []) + if isinstance(deps, list): + dep_strings.extend(str(d) for d in deps) + # [project.optional-dependencies] + opt_deps = project.get("optional-dependencies", {}) + if isinstance(opt_deps, dict): + for group in opt_deps.values(): + if isinstance(group, list): + dep_strings.extend(str(d) for d in group) + + build_system = data.get("build-system", {}) + if isinstance(build_system, dict): + requires = build_system.get("requires", []) + if isinstance(requires, list): + dep_strings.extend(str(d) for d in requires) + + lines = content.splitlines() + results: list[tuple[str, str | None, int]] = [] + for dep_str in dep_strings: + m = _PKG_NAME_RE.match(dep_str.strip()) + if not m: + continue + pkg_name = m.group(1) + if pkg_name.lower() in _PEP621_METADATA_KEYS: + continue + + # Approximate line number: find the package name in the raw content. + line_num = 1 + for i, line in enumerate(lines, 1): + if pkg_name.lower() in line.lower(): + line_num = i + break + + # Extract pinned version for SC4 comparison (== or <=). + ver_m = re.search(r"(?:==|<=)\s*([\d.]+)", dep_str) + version = ver_m.group(1) if ver_m else None + results.append((pkg_name, version, line_num)) + + return results + + def _extract_packages_from_package_json(content: str) -> list[tuple[str, str | None, int]]: """Extract (package_name, version_or_None, line_number) from package.json content.""" results: list[tuple[str, str | None, int]] = [] @@ -695,7 +779,10 @@ def _analyze_dependencies( return findings if is_python_dep: - packages = _extract_packages_from_requirements(content) + if "pyproject.toml" in lower_path: + packages = _extract_packages_from_pyproject(content) + else: + packages = _extract_packages_from_requirements(content) ecosystem = ECOSYSTEM_PYPI fallback_db = _FALLBACK_VULNERABLE_PYPI popular = _POPULAR_PYPI diff --git a/tests/unit/test_patterns_new.py b/tests/unit/test_patterns_new.py index 329e2aa..0290cf1 100644 --- a/tests/unit/test_patterns_new.py +++ b/tests/unit/test_patterns_new.py @@ -1037,3 +1037,85 @@ def test_extract_packages_package_json(self) -> None: names = [p[0] for p in sc_mod._extract_packages_from_package_json(content)] assert "express" in names assert "lodash" in names + + def test_extract_packages_from_pyproject_basic(self) -> None: + """pyproject.toml: only [project].dependencies entries are extracted.""" + content = ( + "[project]\n" + 'name = "my-skill"\n' + 'version = "1.0.0"\n' + 'requires-python = ">=3.12"\n' + 'description = "A skill"\n' + 'dependencies = ["requests>=2.31", "pydantic==2.0.0"]\n' + ) + names = [p[0] for p in sc_mod._extract_packages_from_pyproject(content)] + assert "requests" in names + assert "pydantic" in names + # PEP 621 metadata fields must NOT appear as package names + assert "name" not in names + assert "requires-python" not in names + assert "description" not in names + assert "version" not in names + + def test_extract_packages_from_pyproject_build_system(self) -> None: + """[build-system].requires entries are also extracted.""" + content = ( + "[build-system]\n" + 'requires = ["setuptools>=68", "wheel"]\n' + 'build-backend = "setuptools.build_meta"\n' + ) + names = [p[0] for p in sc_mod._extract_packages_from_pyproject(content)] + assert "setuptools" in names + assert "wheel" in names + assert "build-backend" not in names + + def test_extract_packages_from_pyproject_optional_deps(self) -> None: + """[project.optional-dependencies] entries are also extracted.""" + content = ( + "[project]\n" + 'name = "skill"\n' + "\n" + "[project.optional-dependencies]\n" + 'dev = ["pytest>=7.0", "ruff"]\n' + ) + names = [p[0] for p in sc_mod._extract_packages_from_pyproject(content)] + assert "pytest" in names + assert "ruff" in names + assert "name" not in names + + def test_extract_packages_from_pyproject_malformed_toml(self) -> None: + """Malformed TOML must not crash the analyzer — return empty list.""" + content = "[project\nname = bad toml [\n" + result = sc_mod._extract_packages_from_pyproject(content) + assert result == [] + + def test_extract_packages_from_pyproject_empty(self) -> None: + """Empty pyproject.toml returns empty list.""" + assert sc_mod._extract_packages_from_pyproject("") == [] + + def test_pyproject_toml_no_false_positive_metadata_keys(self) -> None: + """End-to-end: scanning pyproject.toml must not fire SC5/SC6 on metadata keys.""" + content = ( + "[project]\n" + 'name = "my-skill"\n' + 'version = "1.0.0"\n' + 'description = "helper"\n' + 'requires-python = ">=3.12"\n' + 'authors = [{name = "Dev"}]\n' + 'keywords = ["ai", "agent"]\n' + 'dependencies = ["httpx>=0.24"]\n' + ) + state = { + "components": ["pyproject.toml"], + "file_cache": {"pyproject.toml": content}, + } + from skillspector.nodes.analyzers import static_patterns_supply_chain as sc + from skillspector.nodes.analyzers import static_runner + + findings = static_runner.run_static_patterns(state, [sc]) + rule_ids = [f.rule_id for f in findings] + # SC5/SC6 must not fire for 'name', 'description', 'requires-python', etc. + for f in findings: + assert f.rule_id not in ("SC5", "SC6") or f.matched_text.lower() in ( + "httpx", + ), f"False positive SC5/SC6 on metadata key: {f.matched_text}"