Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit a99089a

Browse files
filipchristiansen and ix-56h authored
refactor: consistent cloning & pattern-handling (#388)
Co-authored-by: ix-56h <n.guintini@protonmail.com>
1 parent 360a38e commit a99089a

37 files changed

+1022
-865
lines changed

‎.github/workflows/ci.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ jobs:
5151
- name: Install dependencies
5252
run: |
5353
python -m pip install --upgrade pip
54-
python -m pip install ".[dev]"
54+
python -m pip install ".[dev,server]"
5555
5656
- name: Run tests
5757
if: ${{ matrix.coverage != true }}

‎.github/workflows/publish_to_pypi.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ name: Publish to PyPI
22

33
on:
44
release:
5-
types: [created] # Run when you click "Publish release"
5+
types: [created] # Run when you click "Publish release"
66
workflow_dispatch: # ... or run it manually from the Actions tab
77

88
permissions:
@@ -38,7 +38,7 @@ jobs:
3838
name: dist
3939
path: dist/
4040

41-
# Publish to PyPI (only if "dist/" succeeded)
41+
# Publish to PyPI (only if "dist/" succeeded)
4242
pypi-publish:
4343
needs: release-build
4444
runs-on: ubuntu-latest

‎.pre-commit-config.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,9 +122,12 @@ repos:
122122
pytest-asyncio,
123123
pytest-mock,
124124
python-dotenv,
125+
'sentry-sdk[fastapi]',
125126
slowapi,
126127
starlette>=0.40.0,
128+
strenum; python_version < '3.11',
127129
tiktoken>=0.7.0,
130+
typing_extensions>= 4.0.0; python_version < '3.10',
128131
uvicorn>=0.11.7,
129132
]
130133

@@ -144,9 +147,12 @@ repos:
144147
pytest-asyncio,
145148
pytest-mock,
146149
python-dotenv,
150+
'sentry-sdk[fastapi]',
147151
slowapi,
148152
starlette>=0.40.0,
153+
strenum; python_version < '3.11',
149154
tiktoken>=0.7.0,
155+
typing_extensions>= 4.0.0; python_version < '3.10',
150156
uvicorn>=0.11.7,
151157
]
152158

‎CONTRIBUTING.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ If you ever get stuck, reach out on [Discord](https://discord.com/invite/zerRaGK
3232
```bash
3333
python -m venv .venv
3434
source .venv/bin/activate
35-
pip install -e ".[dev]"
35+
pip install -e ".[dev,server]"
3636
pre-commit install
3737
```
3838

‎README.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,14 @@ You can install it using `pip`:
6666
pip install gitingest
6767
```
6868

69+
or
70+
71+
```bash
72+
pip install gitingest[server]
73+
```
74+
75+
to include server dependencies for self-hosting.
76+
6977
However, it might be a good idea to use `pipx` to install it.
7078
You can install `pipx` using your preferred package manager.
7179

‎pyproject.toml

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,17 +6,14 @@ readme = {file = "README.md", content-type = "text/markdown" }
66
requires-python = ">= 3.8"
77
dependencies = [
88
"click>=8.0.0",
9-
"fastapi[standard]>=0.109.1", # Minimum safe release (https://osv.dev/vulnerability/PYSEC-2024-38)
109
"httpx",
1110
"pathspec>=0.12.1",
1211
"pydantic",
1312
"python-dotenv",
14-
"slowapi",
1513
"starlette>=0.40.0", # Minimum safe release (https://osv.dev/vulnerability/GHSA-f96h-pmfr-66vw)
14+
"strenum; python_version < '3.11'",
1615
"tiktoken>=0.7.0", # Support for o200k_base encoding
1716
"typing_extensions>= 4.0.0; python_version < '3.10'",
18-
"uvicorn>=0.11.7", # Minimum safe release (https://osv.dev/vulnerability/PYSEC-2020-150)
19-
"prometheus-client",
2017
]
2118

2219
license = {file = "LICENSE"}
@@ -46,6 +43,14 @@ dev = [
4643
"pytest-mock",
4744
]
4845

46+
server = [
47+
"fastapi[standard]>=0.109.1", # Minimum safe release (https://osv.dev/vulnerability/PYSEC-2024-38)
48+
"prometheus-client",
49+
"sentry-sdk[fastapi]",
50+
"slowapi",
51+
"uvicorn>=0.11.7", # Minimum safe release (https://osv.dev/vulnerability/PYSEC-2020-150)
52+
]
53+
4954
[project.scripts]
5055
gitingest = "gitingest.__main__:main"
5156

‎src/gitingest/__init__.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,5 @@
11
"""Gitingest: A package for ingesting data from Git repositories."""
22

3-
from gitingest.clone import clone_repo
43
from gitingest.entrypoint import ingest, ingest_async
5-
from gitingest.ingestion import ingest_query
6-
from gitingest.query_parser import parse_query
74

8-
__all__ = ["clone_repo", "ingest", "ingest_async", "ingest_query", "parse_query"]
5+
__all__ = ["ingest", "ingest_async"]

‎src/gitingest/clone.py

Lines changed: 17 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,15 @@
88
from gitingest.config import DEFAULT_TIMEOUT
99
from gitingest.utils.git_utils import (
1010
check_repo_exists,
11+
checkout_partial_clone,
1112
create_git_auth_header,
1213
create_git_command,
1314
ensure_git_installed,
1415
is_github_host,
16+
resolve_commit,
1517
run_command,
1618
)
17-
from gitingest.utils.os_utils import ensure_directory
19+
from gitingest.utils.os_utils import ensure_directory_exists_or_create
1820
from gitingest.utils.timeout_wrapper import async_timeout
1921

2022
if TYPE_CHECKING:
@@ -45,71 +47,42 @@ async def clone_repo(config: CloneConfig, *, token: str | None = None) -> None:
4547
# Extract and validate query parameters
4648
url: str = config.url
4749
local_path: str = config.local_path
48-
commit: str | None = config.commit
49-
branch: str | None = config.branch
50-
tag: str | None = config.tag
5150
partial_clone: bool = config.subpath != "/"
5251

53-
# Create parent directory if it doesn't exist
54-
await ensure_directory(Path(local_path).parent)
52+
await ensure_git_installed()
53+
await ensure_directory_exists_or_create(Path(local_path).parent)
5554

56-
# Check if the repository exists
5755
if not await check_repo_exists(url, token=token):
5856
msg = "Repository not found. Make sure it is public or that you have provided a valid token."
5957
raise ValueError(msg)
6058

59+
commit = await resolve_commit(config, token=token)
60+
6161
clone_cmd = ["git"]
6262
if token and is_github_host(url):
6363
clone_cmd += ["-c", create_git_auth_header(token, url=url)]
6464

65-
clone_cmd += ["clone", "--single-branch"]
66-
67-
if config.include_submodules:
68-
clone_cmd += ["--recurse-submodules"]
69-
65+
clone_cmd += ["clone", "--single-branch", "--no-checkout", "--depth=1"]
7066
if partial_clone:
7167
clone_cmd += ["--filter=blob:none", "--sparse"]
7268

73-
# Shallow clone unless a specific commit is requested
74-
if not commit:
75-
clone_cmd += ["--depth=1"]
76-
77-
# Prefer tag over branch when both are provided
78-
if tag:
79-
clone_cmd += ["--branch", tag]
80-
elif branch and branch.lower() not in ("main", "master"):
81-
clone_cmd += ["--branch", branch]
82-
8369
clone_cmd += [url, local_path]
8470

8571
# Clone the repository
86-
await ensure_git_installed()
8772
await run_command(*clone_cmd)
8873

8974
# Checkout the subpath if it is a partial clone
9075
if partial_clone:
91-
await _checkout_partial_clone(config, token)
76+
await checkout_partial_clone(config, token=token)
9277

93-
# Checkout the commit if it is provided
94-
if commit:
95-
checkout_cmd = create_git_command(["git"], local_path, url, token)
96-
await run_command(*checkout_cmd, "checkout", commit)
78+
git = create_git_command(["git"], local_path, url, token)
9779

80+
# Ensure the commit is locally available
81+
await run_command(*git, "fetch", "--depth=1", "origin", commit)
9882

99-
async def _checkout_partial_clone(config: CloneConfig, token: str | None) -> None:
100-
"""Configure sparse-checkout for a partially cloned repository.
83+
# Write the work-tree at that commit
84+
await run_command(*git, "checkout", commit)
10185

102-
Parameters
103-
----------
104-
config : CloneConfig
105-
The configuration for cloning the repository, including subpath and blob flag.
106-
token : str | None
107-
GitHub personal access token (PAT) for accessing private repositories.
108-
109-
"""
110-
subpath = config.subpath.lstrip("/")
111-
if config.blob:
112-
# Remove the file name from the subpath when ingesting from a file url (e.g. blob/branch/path/file.txt)
113-
subpath = str(Path(subpath).parent.as_posix())
114-
checkout_cmd = create_git_command(["git"], config.local_path, config.url, token)
115-
await run_command(*checkout_cmd, "sparse-checkout", "set", subpath)
86+
# Update submodules
87+
if config.include_submodules:
88+
await run_command(*git, "submodule", "update", "--init", "--recursive", "--depth=1")

‎src/gitingest/entrypoint.py

Lines changed: 62 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -3,19 +3,30 @@
33
from __future__ import annotations
44

55
import asyncio
6+
import errno
67
import shutil
8+
import stat
79
import sys
810
import warnings
911
from contextlib import asynccontextmanager
1012
from pathlib import Path
11-
from typing import AsyncGenerator
13+
from typing import TYPE_CHECKING, AsyncGenerator, Callable
14+
from urllib.parse import urlparse
1215

1316
from gitingest.clone import clone_repo
1417
from gitingest.config import MAX_FILE_SIZE
1518
from gitingest.ingestion import ingest_query
16-
from gitingest.query_parser import IngestionQuery, parse_query
19+
from gitingest.query_parser import parse_local_dir_path, parse_remote_repo
1720
from gitingest.utils.auth import resolve_token
21+
from gitingest.utils.compat_func import removesuffix
1822
from gitingest.utils.ignore_patterns import load_ignore_patterns
23+
from gitingest.utils.pattern_utils import process_patterns
24+
from gitingest.utils.query_parser_utils import KNOWN_GIT_HOSTS
25+
26+
if TYPE_CHECKING:
27+
from types import TracebackType
28+
29+
from gitingest.schemas import IngestionQuery
1930

2031

2132
async def ingest_async(
@@ -74,13 +85,23 @@ async def ingest_async(
7485
"""
7586
token = resolve_token(token)
7687

77-
query: IngestionQuery = await parse_query(
78-
source=source,
79-
max_file_size=max_file_size,
80-
from_web=False,
88+
source = removesuffix(source.strip(), ".git")
89+
90+
# Determine the parsing method based on the source type
91+
if urlparse(source).scheme in ("https", "http") or any(h in source for h in KNOWN_GIT_HOSTS):
92+
# We either have a full URL or a domain-less slug
93+
query = await parse_remote_repo(source, token=token)
94+
query.include_submodules = include_submodules
95+
_override_branch_and_tag(query, branch=branch, tag=tag)
96+
97+
else:
98+
# Local path scenario
99+
query = parse_local_dir_path(source)
100+
101+
query.max_file_size = max_file_size
102+
query.ignore_patterns, query.include_patterns = process_patterns(
103+
exclude_patterns=exclude_patterns,
81104
include_patterns=include_patterns,
82-
ignore_patterns=exclude_patterns,
83-
token=token,
84105
)
85106

86107
if query.url:
@@ -235,17 +256,49 @@ async def _clone_repo_if_remote(query: IngestionQuery, *, token: str | None) ->
235256
GitHub personal access token (PAT) for accessing private repositories.
236257
237258
"""
259+
kwargs = {}
260+
if sys.version_info >= (3, 12):
261+
kwargs["onexc"] = _handle_remove_readonly
262+
else:
263+
kwargs["onerror"] = _handle_remove_readonly
264+
238265
if query.url:
239266
clone_config = query.extract_clone_config()
240267
await clone_repo(clone_config, token=token)
241268
try:
242269
yield
243270
finally:
244-
shutil.rmtree(query.local_path.parent)
271+
shutil.rmtree(query.local_path.parent, **kwargs)
245272
else:
246273
yield
247274

248275

276+
def _handle_remove_readonly(
277+
func: Callable,
278+
path: str,
279+
exc_info: BaseException | tuple[type[BaseException], BaseException, TracebackType],
280+
) -> None:
281+
"""Handle permission errors raised by ``shutil.rmtree()``.
282+
283+
* Makes the target writable (removes the read-only attribute).
284+
* Retries the original operation (``func``) once.
285+
286+
"""
287+
# 'onerror' passes a (type, value, tb) tuple; 'onexc' passes the exception
288+
if isinstance(exc_info, tuple): # 'onerror' (Python <3.12)
289+
exc: BaseException = exc_info[1]
290+
else: # 'onexc' (Python 3.12+)
291+
exc = exc_info
292+
293+
# Handle only 'Permission denied' and 'Operation not permitted'
294+
if not isinstance(exc, OSError) or exc.errno not in {errno.EACCES, errno.EPERM}:
295+
raise exc
296+
297+
# Make the target writable
298+
Path(path).chmod(stat.S_IWRITE)
299+
func(path)
300+
301+
249302
async def _write_output(tree: str, content: str, target: str | None) -> None:
250303
"""Write combined output to ``target`` (``"-"`` ⇒ stdout).
251304

‎src/gitingest/ingestion.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
from gitingest.utils.ingestion_utils import _should_exclude, _should_include
1212

1313
if TYPE_CHECKING:
14-
from gitingest.query_parser import IngestionQuery
14+
from gitingest.schemas import IngestionQuery
1515

1616

1717
def ingest_query(query: IngestionQuery) -> tuple[str, str, str]:

0 commit comments

Comments
(0)

AltStyle によって変換されたページ (->オリジナル) /