fix(skills): cache GitHub repo trees to avoid rate-limit exhaustion on install
Skills.sh installs hit the GitHub API 45 times per install because the
same repo tree was fetched 6 times redundantly. Combined with search
(23 API calls), this totals 68 — exceeding the unauthenticated rate
limit of 60 req/hr, causing 'Could not fetch' errors for users without
a GITHUB_TOKEN.
Changes:
- Add _get_repo_tree() cache to GitHubSource — repo info + recursive
tree fetched once per repo per source instance, eliminating 10
redundant API calls (6 tree + 4 candidate 404s)
- _download_directory_via_tree returns {} (not None) when cached tree
shows path doesn't exist, skipping unnecessary Contents API fallback
- _check_rate_limit_response() detects exhausted quota and sets
is_rate_limited flag
- do_install() shows actionable hint when rate limited: set
GITHUB_TOKEN or install gh CLI
Before: 45 API calls per install (68 total with search)
After: 31 API calls per install (54 total with search — under 60/hr)
Reported by community user from Vietnam (no GitHub auth configured).
This commit is contained in:
@ -335,7 +335,23 @@ def do_install(identifier: str, category: str = "", force: bool = False,
|
||||
meta, bundle, _matched_source = _resolve_source_meta_and_bundle(identifier, sources)
|
||||
|
||||
if not bundle:
|
||||
c.print(f"[bold red]Error:[/] Could not fetch '{identifier}' from any source.\n")
|
||||
# Check if any source hit GitHub API rate limit
|
||||
rate_limited = any(
|
||||
getattr(src, "is_rate_limited", False)
|
||||
or getattr(getattr(src, "github", None), "is_rate_limited", False)
|
||||
for src in sources
|
||||
)
|
||||
c.print(f"[bold red]Error:[/] Could not fetch '{identifier}' from any source.")
|
||||
if rate_limited:
|
||||
c.print(
|
||||
"[yellow]Hint:[/] GitHub API rate limit exhausted "
|
||||
"(unauthenticated: 60 requests/hour).\n"
|
||||
"Set [bold]GITHUB_TOKEN[/] in your .env or install the "
|
||||
"[bold]gh[/] CLI and run [bold]gh auth login[/] "
|
||||
"to raise the limit to 5,000/hr.\n"
|
||||
)
|
||||
else:
|
||||
c.print()
|
||||
return
|
||||
|
||||
# Auto-detect category for official skills (e.g. "official/autonomous-ai-agents/blackbox")
|
||||
|
||||
@ -296,10 +296,20 @@ class GitHubSource(SkillSource):
|
||||
self.taps = list(self.DEFAULT_TAPS)
|
||||
if extra_taps:
|
||||
self.taps.extend(extra_taps)
|
||||
# Per-instance cache: repo -> (default_branch, tree_entries)
|
||||
# Survives within a single search/install flow, avoiding redundant API calls.
|
||||
self._tree_cache: Dict[str, Tuple[str, List[dict]]] = {}
|
||||
# Set when GitHub returns 403 with rate limit exhausted
|
||||
self._rate_limited: bool = False
|
||||
|
||||
def source_id(self) -> str:
|
||||
return "github"
|
||||
|
||||
@property
|
||||
def is_rate_limited(self) -> bool:
|
||||
"""Whether GitHub API rate limit was hit during operations."""
|
||||
return self._rate_limited
|
||||
|
||||
def trust_level_for(self, identifier: str) -> str:
|
||||
# identifier format: "owner/repo/path/to/skill"
|
||||
parts = identifier.split("/", 2)
|
||||
@ -443,6 +453,69 @@ class GitHubSource(SkillSource):
|
||||
self._write_cache(cache_key, [self._meta_to_dict(s) for s in skills])
|
||||
return skills
|
||||
|
||||
# -- Repo tree cache (avoids redundant API calls) --
|
||||
|
||||
def _get_repo_tree(self, repo: str) -> Optional[Tuple[str, List[dict]]]:
|
||||
"""Get cached or fresh repo tree.
|
||||
|
||||
Returns ``(default_branch, tree_entries)`` or ``None``.
|
||||
A single install can call ``_download_directory_via_tree`` and
|
||||
``_find_skill_in_repo_tree`` multiple times for the same repo — this
|
||||
cache eliminates the redundant ``GET /repos/{repo}`` +
|
||||
``GET /repos/{repo}/git/trees/{branch}`` round-trips (previously up to
|
||||
6 duplicated pairs per install, consuming ~12 of the 60/hr
|
||||
unauthenticated rate limit for nothing).
|
||||
"""
|
||||
if repo in self._tree_cache:
|
||||
return self._tree_cache[repo]
|
||||
|
||||
headers = self.auth.get_headers()
|
||||
|
||||
# Resolve default branch
|
||||
try:
|
||||
resp = httpx.get(
|
||||
f"https://api.github.com/repos/{repo}",
|
||||
headers=headers, timeout=15, follow_redirects=True,
|
||||
)
|
||||
if resp.status_code != 200:
|
||||
self._check_rate_limit_response(resp)
|
||||
return None
|
||||
default_branch = resp.json().get("default_branch", "main")
|
||||
except (httpx.HTTPError, ValueError):
|
||||
return None
|
||||
|
||||
# Fetch recursive tree
|
||||
try:
|
||||
resp = httpx.get(
|
||||
f"https://api.github.com/repos/{repo}/git/trees/{default_branch}",
|
||||
params={"recursive": "1"},
|
||||
headers=headers, timeout=30, follow_redirects=True,
|
||||
)
|
||||
if resp.status_code != 200:
|
||||
self._check_rate_limit_response(resp)
|
||||
return None
|
||||
tree_data = resp.json()
|
||||
if tree_data.get("truncated"):
|
||||
logger.debug("Git tree truncated for %s, cannot cache", repo)
|
||||
return None
|
||||
except (httpx.HTTPError, ValueError):
|
||||
return None
|
||||
|
||||
entries = tree_data.get("tree", [])
|
||||
self._tree_cache[repo] = (default_branch, entries)
|
||||
return (default_branch, entries)
|
||||
|
||||
def _check_rate_limit_response(self, resp: "httpx.Response") -> None:
|
||||
"""Flag the instance as rate-limited when GitHub returns 403 + exhausted quota."""
|
||||
if resp.status_code == 403:
|
||||
remaining = resp.headers.get("X-RateLimit-Remaining", "")
|
||||
if remaining == "0":
|
||||
self._rate_limited = True
|
||||
logger.warning(
|
||||
"GitHub API rate limit exhausted (unauthenticated: 60 req/hr). "
|
||||
"Set GITHUB_TOKEN or install the gh CLI to raise the limit to 5,000/hr."
|
||||
)
|
||||
|
||||
def _download_directory(self, repo: str, path: str) -> Dict[str, str]:
|
||||
"""Recursively download all text files from a GitHub directory.
|
||||
|
||||
@ -458,40 +531,34 @@ class GitHubSource(SkillSource):
|
||||
return self._download_directory_recursive(repo, path)
|
||||
|
||||
def _download_directory_via_tree(self, repo: str, path: str) -> Optional[Dict[str, str]]:
|
||||
"""Download an entire directory using the Git Trees API (single request)."""
|
||||
"""Download an entire directory using the Git Trees API (single request).
|
||||
|
||||
Returns:
|
||||
dict of files if the path exists and has content,
|
||||
empty dict ``{}`` if the tree is cached but the path doesn't exist
|
||||
(prevents unnecessary Contents API fallback),
|
||||
``None`` if the tree couldn't be fetched (triggers Contents API fallback).
|
||||
"""
|
||||
path = path.rstrip("/")
|
||||
headers = self.auth.get_headers()
|
||||
|
||||
# Resolve the default branch via the repo endpoint
|
||||
try:
|
||||
repo_url = f"https://api.github.com/repos/{repo}"
|
||||
resp = httpx.get(repo_url, headers=headers, timeout=15, follow_redirects=True)
|
||||
if resp.status_code != 200:
|
||||
return None
|
||||
default_branch = resp.json().get("default_branch", "main")
|
||||
except (httpx.HTTPError, ValueError):
|
||||
cached = self._get_repo_tree(repo)
|
||||
if cached is None:
|
||||
return None
|
||||
_default_branch, tree_entries = cached
|
||||
|
||||
# Fetch the full recursive tree (branch name works as tree-ish)
|
||||
try:
|
||||
tree_url = f"https://api.github.com/repos/{repo}/git/trees/{default_branch}"
|
||||
resp = httpx.get(
|
||||
tree_url, params={"recursive": "1"},
|
||||
headers=headers, timeout=30, follow_redirects=True,
|
||||
)
|
||||
if resp.status_code != 200:
|
||||
return None
|
||||
tree_data = resp.json()
|
||||
if tree_data.get("truncated"):
|
||||
logger.debug("Git tree truncated for %s, falling back to Contents API", repo)
|
||||
return None
|
||||
except (httpx.HTTPError, ValueError):
|
||||
return None
|
||||
# Check if ANY entry lives under the target path
|
||||
prefix = f"{path}/"
|
||||
has_entries = any(
|
||||
item.get("path", "").startswith(prefix) for item in tree_entries
|
||||
)
|
||||
if not has_entries:
|
||||
# Path definitively doesn't exist in the repo — return empty
|
||||
# instead of None to skip the Contents API fallback.
|
||||
return {}
|
||||
|
||||
# Filter to blobs under our target path and fetch content
|
||||
prefix = f"{path}/"
|
||||
files: Dict[str, str] = {}
|
||||
for item in tree_data.get("tree", []):
|
||||
for item in tree_entries:
|
||||
if item.get("type") != "blob":
|
||||
continue
|
||||
item_path = item.get("path", "")
|
||||
@ -548,38 +615,14 @@ class GitHubSource(SkillSource):
|
||||
handles deeply nested directory structures like
|
||||
``cli-tool/components/skills/development/<skill>/SKILL.md``.
|
||||
"""
|
||||
# Get default branch
|
||||
try:
|
||||
resp = httpx.get(
|
||||
f"https://api.github.com/repos/{repo}",
|
||||
headers=self.auth.get_headers(),
|
||||
timeout=15,
|
||||
follow_redirects=True,
|
||||
)
|
||||
if resp.status_code != 200:
|
||||
return None
|
||||
default_branch = resp.json().get("default_branch", "main")
|
||||
except (httpx.HTTPError, json.JSONDecodeError):
|
||||
return None
|
||||
|
||||
# Get recursive tree (single API call for the entire repo)
|
||||
try:
|
||||
resp = httpx.get(
|
||||
f"https://api.github.com/repos/{repo}/git/trees/{default_branch}",
|
||||
params={"recursive": "1"},
|
||||
headers=self.auth.get_headers(),
|
||||
timeout=30,
|
||||
follow_redirects=True,
|
||||
)
|
||||
if resp.status_code != 200:
|
||||
return None
|
||||
tree_data = resp.json()
|
||||
except (httpx.HTTPError, json.JSONDecodeError):
|
||||
cached = self._get_repo_tree(repo)
|
||||
if cached is None:
|
||||
return None
|
||||
_default_branch, tree_entries = cached
|
||||
|
||||
# Look for SKILL.md files inside directories named <skill_name>
|
||||
skill_md_suffix = f"/{skill_name}/SKILL.md"
|
||||
for entry in tree_data.get("tree", []):
|
||||
for entry in tree_entries:
|
||||
if entry.get("type") != "blob":
|
||||
continue
|
||||
path = entry.get("path", "")
|
||||
@ -601,6 +644,7 @@ class GitHubSource(SkillSource):
|
||||
)
|
||||
if resp.status_code == 200:
|
||||
return resp.text
|
||||
self._check_rate_limit_response(resp)
|
||||
except httpx.HTTPError as e:
|
||||
logger.debug("GitHub contents API fetch failed: %s", e)
|
||||
return None
|
||||
|
||||
Reference in New Issue
Block a user