diff --git a/HISTORY.md b/HISTORY.md index 7ca2548..a8e5025 100755 --- a/HISTORY.md +++ b/HISTORY.md @@ -2,36 +2,55 @@ Releases ======== -Version 0.0.1 +Version 0.0.9 ============= -Release date: Nov 9, 2022 +Release date: Jul 1, 2024 +* Skip building language binaries from source -* Language parser for Java, Python, JavaScript, PHP, Golang, Ruby, C++, C#, C - * get_docstring - * get_class_list, get_function_list - * get_class_metadata, get_function_metadata -* Clean docstring function -* Data preprocessing source code -* Tree-sitter utils: build_language, parse_code +Version 0.0.8 +============= +Release date: Aug 17, 2023 -Version 0.0.2 +* Update format codetext_cli +* Update PythonParser: Handle class definitions with empty argument list class ABC() +* Add Javascript undeclared functions +* Add PHP interface +* Add Ruby actions with block parameters + +Version 0.0.7 ============= -Release date: Nov 25, 2022 +Release date: Jul 5, 2023 -* Language parser for Rust - * get_docstring - * get_class_list, get_function_list - * get_class_metadata, get_function_metadata -* Processing utils: - * extract_docstring - * extract_node - * get_line_definitions - * get_node_definitions - * process_raw_node -* Postprocessing: - * Merge file (from batches) - * Split into train/test/valid (by #sample category) - * Deduplicate sample +* Update all class extractor format (using dict instead of list) +* Fix missing identifier, parameter in C, C#, Java parser +* Implement CLI + +Version 0.0.6 +============= +Release date: Jan 9, 2023 + +* Add tree sitter utils (in codetext.parser) +* Replace all `match_from_span` to `get_node_text` +* Replace all `traverse_type` to `get_node_by_kind` +* Fix `CppParser.get_function_metadata` missing `param_type` and `param_identifier` +* Update return metadata from all parser + +Version 0.0.5 +============= +Release date: Dec 12, 2022 + +* Fix package import path +* Adding auto build workflow +* Separate codetext parser with processing source code +* 
Fix `remove_comment_delimiter` remove leading whitespace +* Update unittest for parser and utilities + +Version 0.0.4 +============= +Release date: Dec 2, 2022 + +* Fix main package root path +* Loosen `docstring_parser` dependency Version 0.0.3 ============= @@ -51,47 +70,33 @@ Release date: Dec 2, 2022 * check_contain_many_uppercase_word * check_contain_many_long_word -Version 0.0.4 -============= -Release date: Dec 2, 2022 - -* Fix main package root path -* Loosen `docstring_parser` dependency - -Version 0.0.5 -============= -Release data: Dec 12, 2022 - -* Fix package import path -* Adding auto build workflow -* Seperate codetext parser with processing source code -* Fix `remove_comment_delimiter` remove leading whitespace -* Update unittest for parser and utilites - -Version 0.0.6 -============= -Release data: Jan 9, 2023 - -* Add tree sitter utils (in codetext.parser) -* Replace all `match_from_span` to `get_node_text` -* Replace all `traverse_type` to `get_node_by_kind` -* Fix `CppParser.get_function_metadata` missing `param_type` and `param_identifier` -* Update return metadata from all parser - -Version 0.0.7 +Version 0.0.2 ============= -Release data: Jul 5, 2023 +Release date: Nov 25, 2022 -* Update all class extractor format (using dict instead of list) -* Fix missing identifier, parameter in C, C#, Java parser -* Implement CLI +* Language parser for Rust + * get_docstring + * get_class_list, get_function_list + * get_class_metadata, get_function_metadata +* Processing utils: + * extract_docstring + * extract_node + * get_line_definitions + * get_node_definitions + * process_raw_node +* Postprocessing: + * Merge file (from batches) + * Split into train/test/valid (by #sample category) + * Deduplicate sample -Version 0.0.8 +Version 0.0.1 ============= -Release data: Aug 17, 2023 +Release date: Nov 9, 2022 -* Update format codetext_cli -* Update PythonParser: Handle class definitions with empty argument list class ABC() -* Add Javascript undeclared 
functions -* Add PHP interface -* Add Ruby actions with block parameters \ No newline at end of file +* Language parser for Java, Python, JavaScript, PHP, Golang, Ruby, C++, C#, C + * get_docstring + * get_class_list, get_function_list + * get_class_metadata, get_function_metadata +* Clean docstring function +* Data preprocessing source code +* Tree-sitter utils: build_language, parse_code diff --git a/pyproject.toml b/pyproject.toml index b6a68f6..4baf007 100755 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,8 +21,7 @@ dependencies = [ "Levenshtein>=0.20", "langdetect>=1.0.0", "bs4>=0.0.1", - "tabulate>=0.9.0", - "tree_sitter_languages>=1.10.0" + "tabulate>=0.9.0" ] [project.urls] diff --git a/requirements.txt b/requirements.txt index 9eb91b2..4bc4c06 100755 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ # for preprocessing -tree-sitter +tree-sitter==0.20.4 tabulate Levenshtein langdetect bs4 -tree-sitter-languages \ No newline at end of file +tree_sitter_languages==1.10.2 diff --git a/src/codetext/utils/utils.py b/src/codetext/utils/utils.py index 16c94f8..5975897 100644 --- a/src/codetext/utils/utils.py +++ b/src/codetext/utils/utils.py @@ -96,7 +96,7 @@ def parse_code(raw_code: str, language: str='Auto', tree_sitter_path: str=None) parser = Parser() try: from tree_sitter_languages import get_language, get_parser - parser = get_parser(get_language(language)) + language = get_language(language) except ImportError: # Work-around when pre-built binaries wheels for tree-sitter-languages are not available logger.warning(f"Troubled importing 'tree-sitter-languages', attemp to look for pre-built binaries in the workspace") @@ -105,8 +105,8 @@ def parse_code(raw_code: str, language: str='Auto', tree_sitter_path: str=None) logger.warning(f"Not found `{language}.so` in `{load_path}/tree-sitter/`, attemp to build language") build_language(language, load_path) language = Language(load_path + f"/tree-sitter/{language}.so", language) - 
parser.set_language(language) - + parser.set_language(language) + if isinstance(raw_code, str): raw_code = bytes(raw_code, 'utf8') elif isinstance(raw_code, bytes):