From ced3bf6e83d0975a2cc31a5d0f0fc13cc0c62ead Mon Sep 17 00:00:00 2001 From: minhna1112 Date: Mon, 1 Jul 2024 10:46:45 +0700 Subject: [PATCH 1/4] Update `requirements` - Current version of `codetext` can only work with `tree-sitter==0.20.4` - Replace current language builders with `tree_sitter_languages==1.10.2` --- requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index d438040..4bc4c06 100755 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,7 @@ # for preprocessing -tree-sitter +tree-sitter==0.20.4 tabulate Levenshtein langdetect bs4 +tree_sitter_languages==1.10.2 From 44fafa1694a11a5bc36ade7db7bc242f6b78a4c7 Mon Sep 17 00:00:00 2001 From: minhna1112 Date: Mon, 1 Jul 2024 11:09:56 +0700 Subject: [PATCH 2/4] Skip language build, use pre-built `get_language` --- src/codetext/utils/utils.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/codetext/utils/utils.py b/src/codetext/utils/utils.py index 16c94f8..98d57e5 100644 --- a/src/codetext/utils/utils.py +++ b/src/codetext/utils/utils.py @@ -96,7 +96,7 @@ def parse_code(raw_code: str, language: str='Auto', tree_sitter_path: str=None) parser = Parser() try: from tree_sitter_languages import get_language, get_parser - parser = get_parser(get_language(language)) + language = get_language(language) except ImportError: # Work-around when pre-built binaries wheels for tree-sitter-languages are not available logger.warning(f"Troubled importing 'tree-sitter-languages', attemp to look for pre-built binaries in the workspace") @@ -104,9 +104,9 @@ def parse_code(raw_code: str, language: str='Auto', tree_sitter_path: str=None) if not os.path.exists(ts_lang_path): logger.warning(f"Not found `{language}.so` in 
`{load_path}/tree-sitter/`, attemp to build language") build_language(language, load_path) - language = Language(load_path + f"/tree-sitter/{language}.so", language) - parser.set_language(language) - + language = Language(load_path + f"/tree-sitter/{language}.so", language) + parser.set_language(language) + if isinstance(raw_code, str): raw_code = bytes(raw_code, 'utf8') elif isinstance(raw_code, bytes): From 3daaa8ac83d468888d7467bd04b9a6ade7fb1135 Mon Sep 17 00:00:00 2001 From: minhna1112 Date: Mon, 1 Jul 2024 11:11:23 +0700 Subject: [PATCH 3/4] Fix indentation --- src/codetext/utils/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/codetext/utils/utils.py b/src/codetext/utils/utils.py index 98d57e5..5975897 100644 --- a/src/codetext/utils/utils.py +++ b/src/codetext/utils/utils.py @@ -104,7 +104,7 @@ def parse_code(raw_code: str, language: str='Auto', tree_sitter_path: str=None) if not os.path.exists(ts_lang_path): logger.warning(f"Not found `{language}.so` in `{load_path}/tree-sitter/`, attemp to build language") build_language(language, load_path) - language = Language(load_path + f"/tree-sitter/{language}.so", language) + language = Language(load_path + f"/tree-sitter/{language}.so", language) parser.set_language(language) if isinstance(raw_code, str): From d2a7365f7f944650e84d9fdb6b6794d6c5ea620b Mon Sep 17 00:00:00 2001 From: nmd2k Date: Mon, 1 Jul 2024 08:41:14 +0000 Subject: [PATCH 4/4] update history --- HISTORY.md | 133 +++++++++++++++++++++++++------------------------ pyproject.toml | 5 +- 2 files changed, 71 insertions(+), 67 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 7ca2548..a8e5025 100755 --- a/HISTORY.md +++ b/HISTORY.md @@ -2,36 +2,55 @@ Releases ======== -Version 0.0.1 +Version 0.0.9 ============= -Release date: Nov 9, 2022 +Release date: Jul 1, 2024 +* Skip building language binaries from source -* Language parser for Java, Python, JavaScript, PHP, Golang, Ruby, C++, C#, C - * get_docstring - * 
get_class_list, get_function_list - * get_class_metadata, get_function_metadata -* Clean docstring function -* Data preprocessing source code -* Tree-sitter utils: build_language, parse_code +Version 0.0.8 +============= +Release date: Aug 17, 2023 -Version 0.0.2 +* Update format codetext_cli +* Update PythonParser: Handle class definitions with empty argument list class ABC() +* Add Javascript undeclared functions +* Add PHP interface +* Add Ruby actions with block parameters + +Version 0.0.7 ============= -Release date: Nov 25, 2022 +Release date: Jul 5, 2023 -* Language parser for Rust - * get_docstring - * get_class_list, get_function_list - * get_class_metadata, get_function_metadata -* Processing utils: - * extract_docstring - * extract_node - * get_line_definitions - * get_node_definitions - * process_raw_node -* Postprocessing: - * Merge file (from batches) - * Split into train/test/valid (by #sample category) - * Deduplicate sample +* Update all class extractor format (using dict instead of list) +* Fix missing identifier, parameter in C, C#, Java parser +* Implement CLI + +Version 0.0.6 +============= +Release date: Jan 9, 2023 + +* Add tree sitter utils (in codetext.parser) +* Replace all `match_from_span` to `get_node_text` +* Replace all `traverse_type` to `get_node_by_kind` +* Fix `CppParser.get_function_metadata` missing `param_type` and `param_identifier` +* Update return metadata from all parser + +Version 0.0.5 +============= +Release date: Dec 12, 2022 + +* Fix package import path +* Adding auto build workflow +* Separate codetext parser with processing source code +* Fix `remove_comment_delimiter` remove leading whitespace +* Update unittest for parser and utilities + +Version 0.0.4 +============= +Release date: Dec 2, 2022 + +* Fix main package root path +* Loosen `docstring_parser` dependency Version 0.0.3 ============= @@ -51,47 +70,33 @@ Release date: Dec 2, 2022 * check_contain_many_uppercase_word * check_contain_many_long_word -Version 
0.0.4 -============= -Release date: Dec 2, 2022 - -* Fix main package root path -* Loosen `docstring_parser` dependency - -Version 0.0.5 -============= -Release data: Dec 12, 2022 - -* Fix package import path -* Adding auto build workflow -* Seperate codetext parser with processing source code -* Fix `remove_comment_delimiter` remove leading whitespace -* Update unittest for parser and utilites - -Version 0.0.6 -============= -Release data: Jan 9, 2023 - -* Add tree sitter utils (in codetext.parser) -* Replace all `match_from_span` to `get_node_text` -* Replace all `traverse_type` to `get_node_by_kind` -* Fix `CppParser.get_function_metadata` missing `param_type` and `param_identifier` -* Update return metadata from all parser - -Version 0.0.7 +Version 0.0.2 ============= -Release data: Jul 5, 2023 +Release date: Nov 25, 2022 -* Update all class extractor format (using dict instead of list) -* Fix missing identifier, parameter in C, C#, Java parser -* Implement CLI +* Language parser for Rust + * get_docstring + * get_class_list, get_function_list + * get_class_metadata, get_function_metadata +* Processing utils: + * extract_docstring + * extract_node + * get_line_definitions + * get_node_definitions + * process_raw_node +* Postprocessing: + * Merge file (from batches) + * Split into train/test/valid (by #sample category) + * Deduplicate sample -Version 0.0.8 +Version 0.0.1 ============= -Release data: Aug 17, 2023 +Release date: Nov 9, 2022 -* Update format codetext_cli -* Update PythonParser: Handle class definitions with empty argument list class ABC() -* Add Javascript undeclared functions -* Add PHP interface -* Add Ruby actions with block parameters \ No newline at end of file +* Language parser for Java, Python, JavaScript, PHP, Golang, Ruby, C++, C#, C + * get_docstring + * get_class_list, get_function_list + * get_class_metadata, get_function_metadata +* Clean docstring function +* Data preprocessing source code +* Tree-sitter utils: build_language, 
parse_code diff --git a/pyproject.toml b/pyproject.toml index 9db3f28..4baf007 100755 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,12 +17,11 @@ classifiers = [ "Operating System :: OS Independent", ] dependencies = [ - "tree-sitter==0.20.4", + "tree-sitter>=0.20", "Levenshtein>=0.20", "langdetect>=1.0.0", "bs4>=0.0.1", - "tabulate>=0.9.0", - "tree_sitter_languages>=1.10.0" + "tabulate>=0.9.0" ] [project.urls]

AltStyle によって変換されたページ (->オリジナル) /