fix(skills): cache GitHub repo trees to avoid rate-limit exhaustion on install

Skills.sh installs hit the GitHub API 45 times per install because the same repo tree was fetched 6 times redundantly. Combined with search (23 API calls), this totals 68 — exceeding the unauthenticated rate limit of 60 req/hr, causing 'Could not fetch' errors for users without a GITHUB_TOKEN. Changes: - Add _get_repo_tree() cache to GitHubSource — repo info + recursive tree fetched once per repo per source instance, eliminating 10 redundant API calls (6 tree + 4 candidate 404s) - _download_directory_via_tree returns {} (not None) when cached tree shows path doesn't exist, skipping unnecessary Contents API fallback - _check_rate_limit_response() detects exhausted quota and sets is_rate_limited flag - do_install() shows actionable hint when rate limited: set GITHUB_TOKEN or install gh CLI Before: 45 API calls per install (68 total with search) After: 31 API calls per install (54 total with search — under 60/hr) Reported by community user from Vietnam (no GitHub auth configured).
2026-04-12 13:13:01 -07:00
parent 4c6ebd077e
commit 7e0e5ea03b
2 changed files with 116 additions and 56 deletions
--- a/hermes_cli/skills_hub.py
+++ b/hermes_cli/skills_hub.py
@ -335,7 +335,23 @@ def do_install(identifier: str, category: str = "", force: bool = False,
    meta, bundle, _matched_source = _resolve_source_meta_and_bundle(identifier, sources)

    if not bundle:
-        c.print(f"[bold red]Error:[/] Could not fetch '{identifier}' from any source.\n")
+        # Check if any source hit GitHub API rate limit
+        rate_limited = any(
+            getattr(src, "is_rate_limited", False)
+            or getattr(getattr(src, "github", None), "is_rate_limited", False)
+            for src in sources
+        )
+        c.print(f"[bold red]Error:[/] Could not fetch '{identifier}' from any source.")
+        if rate_limited:
+            c.print(
+                "[yellow]Hint:[/] GitHub API rate limit exhausted "
+                "(unauthenticated: 60 requests/hour).\n"
+                "Set [bold]GITHUB_TOKEN[/] in your .env or install the "
+                "[bold]gh[/] CLI and run [bold]gh auth login[/] "
+                "to raise the limit to 5,000/hr.\n"
+            )
+        else:
+            c.print()
        return

    # Auto-detect category for official skills (e.g. "official/autonomous-ai-agents/blackbox")
--- a/tools/skills_hub.py
+++ b/tools/skills_hub.py
@ -296,10 +296,20 @@ class GitHubSource(SkillSource):
        self.taps = list(self.DEFAULT_TAPS)
        if extra_taps:
            self.taps.extend(extra_taps)
+        # Per-instance cache: repo -> (default_branch, tree_entries)
+        # Survives within a single search/install flow, avoiding redundant API calls.
+        self._tree_cache: Dict[str, Tuple[str, List[dict]]] = {}
+        # Set when GitHub returns 403 with rate limit exhausted
+        self._rate_limited: bool = False

    def source_id(self) -> str:
        return "github"

+    @property
+    def is_rate_limited(self) -> bool:
+        """Whether GitHub API rate limit was hit during operations."""
+        return self._rate_limited
+
    def trust_level_for(self, identifier: str) -> str:
        # identifier format: "owner/repo/path/to/skill"
        parts = identifier.split("/", 2)
@ -443,6 +453,69 @@ class GitHubSource(SkillSource):
        self._write_cache(cache_key, [self._meta_to_dict(s) for s in skills])
        return skills

+    # -- Repo tree cache (avoids redundant API calls) --
+
+    def _get_repo_tree(self, repo: str) -> Optional[Tuple[str, List[dict]]]:
+        """Get cached or fresh repo tree.
+
+        Returns ``(default_branch, tree_entries)`` or ``None``.
+        A single install can call ``_download_directory_via_tree`` and
+        ``_find_skill_in_repo_tree`` multiple times for the same repo — this
+        cache eliminates the redundant ``GET /repos/{repo}`` +
+        ``GET /repos/{repo}/git/trees/{branch}`` round-trips (previously up to
+        6 duplicated pairs per install, consuming ~12 of the 60/hr
+        unauthenticated rate limit for nothing).
+        """
+        if repo in self._tree_cache:
+            return self._tree_cache[repo]
+
+        headers = self.auth.get_headers()
+
+        # Resolve default branch
+        try:
+            resp = httpx.get(
+                f"https://api.github.com/repos/{repo}",
+                headers=headers, timeout=15, follow_redirects=True,
+            )
+            if resp.status_code != 200:
+                self._check_rate_limit_response(resp)
+                return None
+            default_branch = resp.json().get("default_branch", "main")
+        except (httpx.HTTPError, ValueError):
+            return None
+
+        # Fetch recursive tree
+        try:
+            resp = httpx.get(
+                f"https://api.github.com/repos/{repo}/git/trees/{default_branch}",
+                params={"recursive": "1"},
+                headers=headers, timeout=30, follow_redirects=True,
+            )
+            if resp.status_code != 200:
+                self._check_rate_limit_response(resp)
+                return None
+            tree_data = resp.json()
+            if tree_data.get("truncated"):
+                logger.debug("Git tree truncated for %s, cannot cache", repo)
+                return None
+        except (httpx.HTTPError, ValueError):
+            return None
+
+        entries = tree_data.get("tree", [])
+        self._tree_cache[repo] = (default_branch, entries)
+        return (default_branch, entries)
+
+    def _check_rate_limit_response(self, resp: "httpx.Response") -> None:
+        """Flag the instance as rate-limited when GitHub returns 403 + exhausted quota."""
+        if resp.status_code == 403:
+            remaining = resp.headers.get("X-RateLimit-Remaining", "")
+            if remaining == "0":
+                self._rate_limited = True
+                logger.warning(
+                    "GitHub API rate limit exhausted (unauthenticated: 60 req/hr). "
+                    "Set GITHUB_TOKEN or install the gh CLI to raise the limit to 5,000/hr."
+                )
+
    def _download_directory(self, repo: str, path: str) -> Dict[str, str]:
        """Recursively download all text files from a GitHub directory.

@ -458,40 +531,34 @@ class GitHubSource(SkillSource):
        return self._download_directory_recursive(repo, path)

    def _download_directory_via_tree(self, repo: str, path: str) -> Optional[Dict[str, str]]:
-        """Download an entire directory using the Git Trees API (single request)."""
+        """Download an entire directory using the Git Trees API (single request).
+
+        Returns:
+            dict of files if the path exists and has content,
+            empty dict ``{}`` if the tree is cached but the path doesn't exist
+            (prevents unnecessary Contents API fallback),
+            ``None`` if the tree couldn't be fetched (triggers Contents API fallback).
+        """
        path = path.rstrip("/")
-        headers = self.auth.get_headers()

-        # Resolve the default branch via the repo endpoint
-        try:
-            repo_url = f"https://api.github.com/repos/{repo}"
-            resp = httpx.get(repo_url, headers=headers, timeout=15, follow_redirects=True)
-            if resp.status_code != 200:
-                return None
-            default_branch = resp.json().get("default_branch", "main")
-        except (httpx.HTTPError, ValueError):
+        cached = self._get_repo_tree(repo)
+        if cached is None:
            return None
+        _default_branch, tree_entries = cached

-        # Fetch the full recursive tree (branch name works as tree-ish)
-        try:
-            tree_url = f"https://api.github.com/repos/{repo}/git/trees/{default_branch}"
-            resp = httpx.get(
-                tree_url, params={"recursive": "1"},
-                headers=headers, timeout=30, follow_redirects=True,
-            )
-            if resp.status_code != 200:
-                return None
-            tree_data = resp.json()
-            if tree_data.get("truncated"):
-                logger.debug("Git tree truncated for %s, falling back to Contents API", repo)
-                return None
-        except (httpx.HTTPError, ValueError):
-            return None
+        # Check if ANY entry lives under the target path
+        prefix = f"{path}/"
+        has_entries = any(
+            item.get("path", "").startswith(prefix) for item in tree_entries
+        )
+        if not has_entries:
+            # Path definitively doesn't exist in the repo — return empty
+            # instead of None to skip the Contents API fallback.
+            return {}

        # Filter to blobs under our target path and fetch content
-        prefix = f"{path}/"
        files: Dict[str, str] = {}
-        for item in tree_data.get("tree", []):
+        for item in tree_entries:
            if item.get("type") != "blob":
                continue
            item_path = item.get("path", "")
@ -548,38 +615,14 @@ class GitHubSource(SkillSource):
        handles deeply nested directory structures like
        ``cli-tool/components/skills/development/<skill>/SKILL.md``.
        """
-        # Get default branch
-        try:
-            resp = httpx.get(
-                f"https://api.github.com/repos/{repo}",
-                headers=self.auth.get_headers(),
-                timeout=15,
-                follow_redirects=True,
-            )
-            if resp.status_code != 200:
-                return None
-            default_branch = resp.json().get("default_branch", "main")
-        except (httpx.HTTPError, json.JSONDecodeError):
-            return None
-
-        # Get recursive tree (single API call for the entire repo)
-        try:
-            resp = httpx.get(
-                f"https://api.github.com/repos/{repo}/git/trees/{default_branch}",
-                params={"recursive": "1"},
-                headers=self.auth.get_headers(),
-                timeout=30,
-                follow_redirects=True,
-            )
-            if resp.status_code != 200:
-                return None
-            tree_data = resp.json()
-        except (httpx.HTTPError, json.JSONDecodeError):
+        cached = self._get_repo_tree(repo)
+        if cached is None:
            return None
+        _default_branch, tree_entries = cached

        # Look for SKILL.md files inside directories named <skill_name>
        skill_md_suffix = f"/{skill_name}/SKILL.md"
-        for entry in tree_data.get("tree", []):
+        for entry in tree_entries:
            if entry.get("type") != "blob":
                continue
            path = entry.get("path", "")
@ -601,6 +644,7 @@ class GitHubSource(SkillSource):
            )
            if resp.status_code == 200:
                return resp.text
+            self._check_rate_limit_response(resp)
        except httpx.HTTPError as e:
            logger.debug("GitHub contents API fetch failed: %s", e)
        return None