diff --git a/.gitignore b/.gitignore index af066ef..bbdf54d 100755 --- a/.gitignore +++ b/.gitignore @@ -11,4 +11,6 @@ src/codetext.egg-info/* *.pyc *.so *.whl - +.idea +.vscode +*.iml diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..2840782 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,6 @@ +# Default ignored files +/shelf/ +/workspace.xml +.idea +.vscode +*.iml \ No newline at end of file diff --git a/HISTORY.md b/HISTORY.md index 7ca2548..a8e5025 100755 --- a/HISTORY.md +++ b/HISTORY.md @@ -2,36 +2,55 @@ Releases ======== -Version 0.0.1 +Version 0.0.9 ============= -Release date: Nov 9, 2022 +Release date: Jul 1, 2024 +* Skip building language binaries from source -* Language parser for Java, Python, JavaScript, PHP, Golang, Ruby, C++, C#, C - * get_docstring - * get_class_list, get_function_list - * get_class_metadata, get_function_metadata -* Clean docstring function -* Data preprocessing source code -* Tree-sitter utils: build_language, parse_code +Version 0.0.8 +============= +Release date: Aug 17, 2023 -Version 0.0.2 +* Update format codetext_cli +* Update PythonParser: Handle class definitions with empty argument list class ABC() +* Add Javascript undeclared functions +* Add PHP interface +* Add Ruby actions with block parameters + +Version 0.0.7 ============= -Release date: Nov 25, 2022 +Release date: Jul 5, 2023 -* Language parser for Rust - * get_docstring - * get_class_list, get_function_list - * get_class_metadata, get_function_metadata -* Processing utils: - * extract_docstring - * extract_node - * get_line_definitions - * get_node_definitions - * process_raw_node -* Postprocessing: - * Merge file (from batches) - * Split into train/test/valid (by #sample category) - * Deduplicate sample +* Update all class extractor format (using dict instead of list) +* Fix missing identifier, parameter in C, C#, Java parser +* Implement CLI + +Version 0.0.6 +============= +Release date: Jan 9, 2023 + +* Add tree sitter utils 
(in codetext.parser) +* Replace all `match_from_span` to `get_node_text` +* Replace all `traverse_type` to `get_node_by_kind` +* Fix `CppParser.get_function_metadata` missing `param_type` and `param_identifier` +* Update return metadata from all parser + +Version 0.0.5 +============= +Release date: Dec 12, 2022 + +* Fix package import path +* Adding auto build workflow +* Separate codetext parser with processing source code +* Fix `remove_comment_delimiter` remove leading whitespace +* Update unittest for parser and utilities + +Version 0.0.4 +============= +Release date: Dec 2, 2022 + +* Fix main package root path +* Loosen `docstring_parser` dependency Version 0.0.3 ============= @@ -51,47 +70,33 @@ Release date: Dec 2, 2022 * check_contain_many_uppercase_word * check_contain_many_long_word -Version 0.0.4 -============= -Release date: Dec 2, 2022 - -* Fix main package root path -* Loosen `docstring_parser` dependency - -Version 0.0.5 -============= -Release data: Dec 12, 2022 - -* Fix package import path -* Adding auto build workflow -* Seperate codetext parser with processing source code -* Fix `remove_comment_delimiter` remove leading whitespace -* Update unittest for parser and utilites - -Version 0.0.6 -============= -Release data: Jan 9, 2023 - -* Add tree sitter utils (in codetext.parser) -* Replace all `match_from_span` to `get_node_text` -* Replace all `traverse_type` to `get_node_by_kind` -* Fix `CppParser.get_function_metadata` missing `param_type` and `param_identifier` -* Update return metadata from all parser - -Version 0.0.7 +Version 0.0.2 ============= -Release data: Jul 5, 2023 +Release date: Nov 25, 2022 -* Update all class extractor format (using dict instead of list) -* Fix missing identifier, parameter in C, C#, Java parser -* Implement CLI +* Language parser for Rust + * get_docstring + * get_class_list, get_function_list + * get_class_metadata, get_function_metadata +* Processing utils: + * extract_docstring + * extract_node + * 
get_line_definitions + * get_node_definitions + * process_raw_node +* Postprocessing: + * Merge file (from batches) + * Split into train/test/valid (by #sample category) + * Deduplicate sample -Version 0.0.8 +Version 0.0.1 ============= -Release data: Aug 17, 2023 +Release date: Nov 9, 2022 -* Update format codetext_cli -* Update PythonParser: Handle class definitions with empty argument list class ABC() -* Add Javascript undeclared functions -* Add PHP interface -* Add Ruby actions with block parameters \ No newline at end of file +* Language parser for Java, Python, JavaScript, PHP, Golang, Ruby, C++, C#, C + * get_docstring + * get_class_list, get_function_list + * get_class_metadata, get_function_metadata +* Clean docstring function +* Data preprocessing source code +* Tree-sitter utils: build_language, parse_code diff --git a/pyproject.toml b/pyproject.toml index d8bb24d..4baf007 100755 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "codetext" -version = "0.0.8" +version = "0.0.9" authors = [ { name="Dung Manh Nguyen", email="dungnm.workspace@gmail.com" }, ] diff --git a/requirements.txt b/requirements.txt index d438040..4bc4c06 100755 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,7 @@ # for preprocessing -tree-sitter +tree-sitter==0.20.4 tabulate Levenshtein langdetect bs4 +tree_sitter_languages==1.10.2 diff --git a/src/codetext/codetext_cli.py b/src/codetext/codetext_cli.py index 36e25ba..43d8824 100644 --- a/src/codetext/codetext_cli.py +++ b/src/codetext/codetext_cli.py @@ -53,15 +53,15 @@ def parse_file(file_path: str, language: str = None, verbose: bool = False) -> L cls_info["code"] = get_node_text(_cls) cls_method = [] - method_list = parser.get_function_list(_cls) - for method in method_list: + current_class_methods = parser.get_function_list(_cls) + for method in current_class_methods: method_info = parser.get_function_metadata(method) method_info['code'] = 
get_node_text(method) cls_method.append(method_info) cls_info["method"] = cls_method cls_metadata.append(cls_info) - method_list.extend(method_list) + method_list.extend(current_class_methods) fn_list: List = parser.get_function_list(root_node) for node in fn_list[:]: diff --git a/src/codetext/utils/utils.py b/src/codetext/utils/utils.py index d330ecb..5975897 100644 --- a/src/codetext/utils/utils.py +++ b/src/codetext/utils/utils.py @@ -92,13 +92,19 @@ def parse_code(raw_code: str, language: str='Auto', tree_sitter_path: str=None) calling_script_path = Path(inspect.getframeinfo(sys._getframe(1)).filename) load_path = str(calling_script_path.parent) - ts_lang_path = os.path.join(load_path, 'tree-sitter', f'{language}.so') - if not os.path.exists(ts_lang_path): - logger.warning(f"Not found `{language}.so` in `{load_path}/tree-sitter/`, attemp to build language") - build_language(language, load_path) - + # Get parser from languages parser = Parser() - language = Language(load_path + f"/tree-sitter/{language}.so", language) + try: + from tree_sitter_languages import get_language, get_parser + language = get_language(language) + except ImportError: + # Work-around when pre-built binaries wheels for tree-sitter-languages are not available + logger.warning(f"Troubled importing 'tree-sitter-languages', attemp to look for pre-built binaries in the workspace") + ts_lang_path = os.path.join(load_path, 'tree-sitter', f'{language}.so') + if not os.path.exists(ts_lang_path): + logger.warning(f"Not found `{language}.so` in `{load_path}/tree-sitter/`, attemp to build language") + build_language(language, load_path) + language = Language(load_path + f"/tree-sitter/{language}.so", language) parser.set_language(language) if isinstance(raw_code, str): diff --git a/tests/setup.py b/tests/setup.py index 4a7f6aa..d9516a6 100755 --- a/tests/setup.py +++ b/tests/setup.py @@ -1,8 +1,12 @@ from ..src.codetext.utils import build_language - +from tree_sitter_languages import get_language, 
get_parser if __name__ == '__main__': lang_list = ['python', 'cpp', 'java', 'c-sharp', 'ruby', 'rust', 'javascript', 'php', 'go'] for lang in lang_list: - build_language(lang) + # build_language(lang) + try: + get_parser(get_language(lang)) + except: + build_language(lang) diff --git a/tests/test_utils/test_utils.py b/tests/test_utils/test_utils.py index d4a4ba4..af7288c 100755 --- a/tests/test_utils/test_utils.py +++ b/tests/test_utils/test_utils.py @@ -14,8 +14,6 @@ def test_parse_code(self): def sum_2_num(a, b): return a + b """ - - build_language(language='python') parse_code(sample, 'python')