diff --git a/.github/workflows/pages.yml b/.github/workflows/pages.yml
new file mode 100644
index 00000000..a557a69b
--- /dev/null
+++ b/.github/workflows/pages.yml
@@ -0,0 +1,72 @@
+# This workflow uses actions that are not certified by GitHub.
+# They are provided by a third-party and are governed by
+# separate terms of service, privacy policy, and support
+# documentation.
+
+# Sample workflow for building and deploying a Jekyll site to GitHub Pages
+name: Deploy Jekyll site to Pages
+
+on:
+  push:
+    branches: ["main", "doc-preview"]
+    paths:
+      - "doc/**"
+
+  # Allows you to run this workflow manually from the Actions tab
+  workflow_dispatch:
+
+# Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages
+permissions:
+  contents: read
+  pages: write
+  id-token: write
+
+# Allow one concurrent deployment
+concurrency:
+  group: "pages"
+  cancel-in-progress: true
+
+jobs:
+  # Build job
+  build:
+    runs-on: ubuntu-latest
+    defaults:
+      run:
+        working-directory: doc
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+      - name: Setup Ruby
+        uses: ruby/setup-ruby@v1
+        with:
+          ruby-version: '3.1' # Not needed with a .ruby-version file
+          bundler-cache: true # runs 'bundle install' and caches installed gems automatically
+          cache-version: 0 # Increment this number if you need to re-download cached gems
+          working-directory: '${{ github.workspace }}/doc'
+      - name: Generate COREF API Documents
+        run: python3 tools/build.py
+      - name: Setup Pages
+        id: pages
+        uses: actions/configure-pages@v3
+      - name: Build with Jekyll
+        # Outputs to the './_site' directory by default
+        run: bundle exec jekyll build --baseurl "${{ steps.pages.outputs.base_path }}"
+        env:
+          JEKYLL_ENV: production
+      - name: Upload artifact
+        # Automatically uploads an artifact from the './_site' directory by default
+        uses: actions/upload-pages-artifact@v1
+        with:
+          path: "doc/_site/"
+
+  # Deployment job
+  deploy:
+    environment:
+      name: github-pages
+      url: ${{ steps.deployment.outputs.page_url }}
+    runs-on: ubuntu-latest
+    needs: build
+    steps:
+      - name: Deploy to GitHub Pages
+        id: deployment
+        uses: actions/deploy-pages@v2
diff --git a/README.md b/README.md
index 6bc3553b..851f1bd2 100644
--- a/README.md
+++ b/README.md
@@ -118,6 +118,7 @@ CodeFuse-Query为CodeFuse代码大模型提供了以下数据清洗能力:
 - [安装、配置、运行](./doc/3_install_and_run.md)
 - [Gödel查询语言介绍](./doc/4_godelscript_language.md)
 - [VSCode开发插件](./doc/5_toolchain.md)
+- [COREF API](https://codefuse-ai.github.io/CodeFuse-Query/godel-api/coref_library_reference.html)
 
 ## 教程 (tutorial)
 - [在线教程](./tutorial/README.md)
diff --git a/doc/1_abstract.md b/doc/1_abstract.md
index 81f68660..d21dc424 100644
--- a/doc/1_abstract.md
+++ b/doc/1_abstract.md
@@ -1,7 +1,9 @@
 随着大规模软件开发的普及，对可扩展且易于适应的静态代码分析技术的需求正在加大。传统的静态分析工具，如 Clang Static Analyzer (CSA) 或 PMD，在检查编程规则或样式问题方面已经展现出了良好的效果。然而，这些工具通常是为了满足特定的目标而设计的，往往无法满足现代软件开发环境中多变和多元化的需求。这些需求可以涉及服务质量 (QoS)、各种编程语言、不同的算法需求，以及各种性能需求。例如，安全团队可能需要复杂的算法，如上下文敏感的污点分析，来审查较小的代码库，而项目经理可能需要一种相对较轻的算法，例如计算圈复杂度的算法，以在较大的代码库上测量开发人员的生产力。
+
 这些多元化的需求，加上大型组织中常见的计算资源限制，构成了一项重大的挑战。由于传统工具采用的是问题特定的计算方式，往往无法在这种环境中实现扩展。因此，我们推出了 CodeQuery，这是一个专为大规模静态分析设计的集中式数据平台。
 在 CodeQuery 的实现中，我们把源代码和分析结果看作数据，把执行过程看作大数据处理，这与传统的以工具为中心的方法有着显著的不同。我们利用大型组织中的常见系统，如数据仓库、MaxCompute 和 Hive 等数据计算设施、OSS 对象存储和 Kubernetes 等灵活计算资源，让 CodeQuery 能够无缝地融入这些系统中。这种方法使 CodeQuery 高度可维护和可扩展，能够支持多元化的需求，并有效应对不断变化的需求。此外，CodeQuery 的开放架构鼓励各种内部系统之间的互操作性，实现了无缝的交互和数据交换。这种集成和交互能力不仅提高了组织内部的自动化程度，也提高了效率，降低了手动错误的可能性。通过打破信息孤岛，推动更互联、更自动化的环境，CodeQuery 显著提高了软件开发过程的整体生产力和效率。
 此外，CodeQuery 的以数据为中心的方法在处理静态源代码分析的领域特定挑战时具有独特的优势。例如，源代码通常是一个高度结构化和互联的数据集，与其他代码和配置文件有强烈的信息和连接。将代码视为数据，CodeQuery 可以巧妙地处理这些问题，这使得它特别适合在大型组织中使用，其中代码库持续但逐步地进行演变，大部分代码在每天进行微小的改动同时保持稳定。
 CodeQuery 还支持如基于代码数据的商业智能 (BI) 这类用例，能生成报告和仪表板，协助监控和决策过程。此外，CodeQuery 在分析大型语言模型 (LLM) 的训练数据方面发挥了重要作用，提供了增强这些模型整体效果的深入见解。
+
 在当前的静态分析领域，CodeQuery 带来了一种新的范式。它不仅满足了大规模、复杂的代码库分析需求，还能适应不断变化和多元化的静态分析场景。CodeQuery 的以数据为中心的方法，使得其在处理大数据环境中的代码分析问题时具有独特优势。CodeQuery 的设计，旨在解决大规模软件开发环境中的静态分析问题。它能够将源代码和分析结果视作数据，使得其可以灵活地融入大型组织的各种系统中。这种方法不仅可以有效地处理大规模的代码库，还可以应对各种复杂的分析需求，从而使得静态分析工作变得更加高效和准确。
 CodeQuery 的特点和优势可以概括为以下几点:
diff --git a/doc/5_toolchain.md b/doc/5_toolchain.md
index 6788c4e7..ef5cd4cc 100644
--- a/doc/5_toolchain.md
+++ b/doc/5_toolchain.md
@@ -79,5 +79,7 @@ code --install-extension [扩展vsix文件路径]
 - `godelScript.libraryDirectoryPath`
   - 用于指定 GödelScript 的库文件夹路径，默认为空。需要时请替换为 GödelScript 库文件夹绝对路径。
   - 如果已经下载 Sparrow CLI ，则库文件夹路径为 `[sparrow cli root]/lib-1.0`。
-# 智能助手
+
+# 智能助手
+
 待开放,尽情期待!
diff --git a/doc/Gemfile b/doc/Gemfile
new file mode 100644
index 00000000..1db68334
--- /dev/null
+++ b/doc/Gemfile
@@ -0,0 +1,7 @@
+source 'https://rubygems.org'
+
+gem "jekyll", "~> 4.3.2" # installed by `gem jekyll`
+# gem "webrick" # required when using Ruby >= 3 and Jekyll <= 4.2.2
+
+gem "just-the-docs", "0.7.0" # pinned to the current release
+# gem "just-the-docs" # always download the latest release
diff --git a/doc/Gemfile.lock b/doc/Gemfile.lock
new file mode 100644
index 00000000..2ded23cf
--- /dev/null
+++ b/doc/Gemfile.lock
@@ -0,0 +1,86 @@
+GEM
+  remote: https://rubygems.org/
+  specs:
+    addressable (2.8.5)
+      public_suffix (>= 2.0.2, < 6.0)
+    colorator (1.1.0)
+    concurrent-ruby (1.2.2)
+    em-websocket (0.5.3)
+      eventmachine (>= 0.12.9)
+      http_parser.rb (~> 0)
+    eventmachine (1.2.7)
+    ffi (1.15.5)
+    forwardable-extended (2.6.0)
+    google-protobuf (3.24.3-arm64-darwin)
+    google-protobuf (3.24.3-x86_64-linux)
+    http_parser.rb (0.8.0)
+    i18n (1.14.1)
+      concurrent-ruby (~> 1.0)
+    jekyll (4.3.2)
+      addressable (~> 2.4)
+      colorator (~> 1.0)
+      em-websocket (~> 0.5)
+      i18n (~> 1.0)
+      jekyll-sass-converter (>= 2.0, < 4.0)
+      jekyll-watch (~> 2.0)
+      kramdown (~> 2.3, >= 2.3.1)
+      kramdown-parser-gfm (~> 1.0)
+      liquid (~> 4.0)
+      mercenary (>= 0.3.6, < 0.5)
+      pathutil (~> 0.9)
+      rouge (>= 3.0, < 5.0)
+      safe_yaml (~> 1.0)
+      terminal-table (>= 1.8, < 4.0)
+      webrick (~> 1.7)
+    jekyll-include-cache (0.2.1)
+      jekyll (>= 3.7, < 5.0)
+    jekyll-sass-converter (3.0.0)
+      sass-embedded (~> 1.54)
+    jekyll-seo-tag (2.8.0)
+      jekyll (>= 3.8, < 5.0)
+    jekyll-watch (2.2.1)
+      listen (~> 3.0)
+    just-the-docs (0.7.0)
+      jekyll (>= 3.8.5)
+      jekyll-include-cache
+      jekyll-seo-tag (>= 2.0)
+      rake (>= 12.3.1)
+    kramdown (2.4.0)
+      rexml
+    kramdown-parser-gfm (1.1.0)
+      kramdown (~> 2.0)
+    liquid (4.0.4)
+    listen (3.8.0)
+      rb-fsevent (~> 0.10, >= 0.10.3)
+      rb-inotify (~> 0.9, >= 0.9.10)
+    mercenary (0.4.0)
+    pathutil (0.16.2)
+      forwardable-extended (~> 2.6)
+    public_suffix (5.0.3)
+    rake (13.0.6)
+    rb-fsevent (0.11.2)
+    rb-inotify (0.10.1)
+      ffi (~> 1.0)
+    rexml (3.2.6)
+    rouge (4.1.3)
+    safe_yaml (1.0.5)
+    sass-embedded (1.67.0-arm64-darwin)
+      google-protobuf (~> 3.23)
+    sass-embedded (1.67.0-x86_64-linux-gnu)
+      google-protobuf (~> 3.23)
+    terminal-table (3.0.2)
+      unicode-display_width (>= 1.1.1, < 3)
+    unicode-display_width (2.4.2)
+    webrick (1.8.1)
+
+PLATFORMS
+  arm64-darwin-21
+  arm64-darwin-23
+  x86_64-linux
+
+DEPENDENCIES
+  jekyll (~> 4.3.2)
+  just-the-docs (= 0.7.0)
+
+BUNDLED WITH
+   2.3.26
diff --git a/doc/_config.yml b/doc/_config.yml
new file mode 100644
index 00000000..3a619a3e
--- /dev/null
+++ b/doc/_config.yml
@@ -0,0 +1,5 @@
+title: CodeFuse-Query Documentation
+description: Documentation for CodeFuse-Query, built with Jekyll and the Just the Docs theme
+theme: just-the-docs
+
+url: https://codefuse-ai.github.io/CodeFuse-Query
diff --git a/doc/index.md b/doc/index.md
new file mode 100644
index 00000000..779bb822
--- /dev/null
+++ b/doc/index.md
@@ -0,0 +1,8 @@
+---
+title: Home
+layout: default
+nav_order: 1
+---
+## 文档 (Documentation)
+
+请见[仓库首页](https://github.com/codefuse-ai/CodeFuse-Query)
diff --git a/doc/tools/build.py b/doc/tools/build.py
new file mode 100644
index 00000000..d661ba8b
--- /dev/null
+++ b/doc/tools/build.py
@@ -0,0 +1,21 @@
+import subprocess
+
+print("Download Sparrow CLI")
+subprocess.run([
+    "curl",
+    "-L",
+    "https://github.com/codefuse-ai/CodeFuse-Query/releases/download/2.0.2/sparrow-cli-2.0.2.linux.tar.gz",
+    "-o",
+    "sparrow-cli.tar.gz"
+])
+subprocess.run([
+    "tar",
+    "-xvzf",
+    "sparrow-cli.tar.gz"
+])
+print("Copy ../assets into ./doc/assets")
+subprocess.run(["cp", "-r", "../assets", "./"])
+print("Concat coref library from ../language into ./.coref-api-build")
+subprocess.run(["python3", "tools/generate_coref_library.py", "../language"])
+print("Generate markdown documents into ./godel-api")
+subprocess.run(["python3", "tools/generate_markdown.py", "./sparrow-cli/godel-script/usr/bin/godel"])
\ No newline at end of file
diff --git a/doc/tools/generate_coref_library.py b/doc/tools/generate_coref_library.py
new file mode 100644
index 00000000..1fd9ad5c
--- /dev/null
+++ b/doc/tools/generate_coref_library.py
@@ -0,0 +1,28 @@
+import sys
+import os
+
+if len(sys.argv) != 2:
+    print("Usage: python this_file.py language_library_directory")
+    exit(-1)
+
+input_language_dir = sys.argv[1]
+
+print("Generate library from", input_language_dir)
+if not os.path.exists("./.coref-api-build"):
+    os.mkdir("./.coref-api-build")
+
+mapper = {
+    "coref.go.gdl": input_language_dir + "/go/lib",
+    "coref.java.gdl": input_language_dir + "/java/lib",
+    "coref.javascript.gdl": input_language_dir + "/javascript/lib",
+    "coref.python.gdl": input_language_dir + "/python/lib",
+    "coref.xml.gdl": input_language_dir + "/xml/lib",
+}
+
+for key in mapper.keys():
+    output_file = "./.coref-api-build/" + key
+    result = ""
+    for root, ignored, files in os.walk(mapper[key]):
+        for file in files:
+            result += open(root + "/" + file, "r").read() + "\n"
+    open(output_file, "w").write(result)
\ No newline at end of file
diff --git a/doc/tools/generate_markdown.py b/doc/tools/generate_markdown.py
new file mode 100644
index 00000000..d54b3632
--- /dev/null
+++ b/doc/tools/generate_markdown.py
@@ -0,0 +1,310 @@
+import subprocess
+import json
+import os
+import sys
+
+def delete_head_spaces(comment: str) -> str:
+    if comment.find("//") == 0:
+        return comment
+    result = ""
+    for string in comment.split("\n"):
+        temp = ""
+        is_space_flag = True
+        for ch in string:
+            if is_space_flag and (ch == ' ' or ch == '\t'):
+                continue
+            is_space_flag = False
+            temp += ch
+        result += temp + "\n"
+    if result[-1] == "\n":
+        return result[:-1]
+    return result
+
+def match_schema_comment(comment_list, schema) -> str:
+    schema_location = schema["location"]
+    for comment in comment_list:
+        comment_location = comment["location"]
+        if comment_location[3] < schema_location[1]-1:
+            continue
+        if comment_location[3] > schema_location[1]:
+            continue
+        return delete_head_spaces(comment["content"])
+    return ""
+
+def match_comment(comment_list, function) -> str:
+    function_location = function["location"]
+    for comment in comment_list:
+        comment_location = comment["location"]
+        if comment_location[3] < function_location[1]-1:
+            continue
+        if comment_location[3] > function_location[1]:
+            continue
+        return delete_head_spaces(comment["content"])
+    return ""
+
+def raw_string(name: str) -> str:
+    name = name.replace("_", "\\_")
+    name = name.replace("<", "\\<")
+    name = name.replace(">", "\\>")
+    return name
+
+def dump_type(type_struct):
+    result = "*" if type_struct["is_set"] == "true" else ""
+    result += type_struct["name"]
+    return result
+
+def dump_function(function):
+    result = "pub fn " + function["name"] + "("
+    for param in function["parameter"]:
+        result += param["name"] + ": "
+        result += dump_type(param["type"]) + ", "
+    if result[-1] == " ":
+        result = result[:-2]
+    result += ") -> "
+    result += dump_type(function["return"]) + ";"
+    return result
+
+def dump_function_parameter(function, link_dir = "./schema/", link_db_path = "./database.html") -> str:
+    basic_type = ["int", "float", "string", "bool"]
+    result = ""
+    for param in function["parameter"]:
+        result += "* Parameter `" + raw_string(param["name"]) + "`: "
+        if param["type"]["name"] in basic_type:
+            result += "`" + dump_type(param["type"]) + "`\n"
+        elif param["type"]["name"] in database_map.keys():
+            result += "[`" + dump_type(param["type"]) + "`](" + link_db_path + ")\n"
+        else:
+            result += "[`" + dump_type(param["type"]) + "`](" + link_dir + param["type"]["name"] + ".html)\n"
+    if function["return"]["name"] in basic_type:
+        result += "* Return `" + dump_type(function["return"]) + "`\n"
+    elif function["return"]["name"] in database_map.keys():
+        result += "* Return [`" + dump_type(function["return"]) + "`](" + link_db_path + ")\n"
+    else:
+        result += "* Return [`" + dump_type(function["return"]) + "`](" + link_dir + function["return"]["name"] + ".html)\n"
+    return result
+
+database_map = {}
+def dump_database(database) -> str:
+    database_map[database["name"]] = 1
+    result = "## " + database["name"] + "\n\n"
+    for table in database["table"]:
+        result += "* " + table["name"] + ": "
+        result += "[*" + table["type"]["name"] + "](./schema/" + table["type"]["name"] + ".html)\n"
+    return result
+
+def dump_schema(comment_list, schema) -> str:
+    result = "---\n"
+    result += "layout: default\n"
+    result += "---\n\n"
+    result += "# " + schema["name"] + "\n\n"
+    comment_of_schema = match_schema_comment(comment_list, schema)
+    if len(comment_of_schema) > 0:
+        result += "```java\n" + comment_of_schema + "\n```\n"
+    if len(schema["parent"]) > 0:
+        result += "Inherit from [" + schema["parent"] + "](" + "./" + schema["parent"] + ".html)\n\n"
+    for field in schema["fields"]:
+        if field["primary"]=="true":
+            result += "Primary key: `" + field["name"] + ": "
+            result += dump_type(field["type"]) + "`\n\n"
+            break
+    result += "```typescript\n"
+    result += "schema " + schema["name"]
+    if len(schema["parent"]) > 0:
+        result += " extends " + schema["parent"]
+    result += " {\n"
+    for field in schema["fields"]:
+        result += " "
+        if field["primary"]=="true":
+            result += "@primary "
+        result += field["name"] + ": "
+        result += dump_type(field["type"]) + ",\n"
+    if result[-2:] == ",\n":
+        result = result[:-2] + "\n"
+    result += "}\n"
+    result += "```\n"
+    for method in schema["methods"]:
+        if method["is_public"]=="false":
+            continue
+        if method["name"] in ["is", "to", "key_eq", "key_neq", "to_set"]:
+            continue
+        result += "## " + schema["name"] + "::" + raw_string(method["name"]) + "\n\n"
+        if method["name"] == "__all__":
+            result += "Data constraint method.\n\n"
+        comment = match_comment(comment_list, method)
+        if len(comment) > 0:
+            result += "```java\n" + comment + "\n```\n"
+        result += dump_function_parameter(method, "./", "../database.html") + "\n"
+        result += "```rust\n"
+        result += dump_function(method) + "\n"
+        result += "```\n"
+    return result
+
+def dfs_visit_schema_hierarchy(schema, schema_list, indent: str) -> str:
+    result = indent + "* [" + schema["name"] + "](./schema/" + schema["name"] + ".html)\n"
+    for i in schema_list:
+        if i["parent"] == schema["name"]:
+            result += dfs_visit_schema_hierarchy(i, schema_list, indent + " ")
+    return result
+
+def dump_schema_tree_view(schema_list) -> str:
+    root_schema = []
+    for schema in schema_list:
+        if len(schema["parent"]) == 0:
+            root_schema.append(schema)
+    result = ""
+    for i in root_schema:
+        result += dfs_visit_schema_hierarchy(i, schema_list, "")
+    return result
+
+if len(sys.argv) != 2:
+    print("Usage: python this_file.py godel_script_executable_path")
+    exit(-1)
+
+godel_compiler_path = sys.argv[1]
+markdown_output_path = "./godel-api"
+input_file_directory = "./.coref-api-build"
+
+dirs = [
+    "./godel-api",
+    "./godel-api/cfamily",
+    "./godel-api/go",
+    "./godel-api/java",
+    "./godel-api/javascript",
+    "./godel-api/properties",
+    "./godel-api/python",
+    "./godel-api/sql",
+    "./godel-api/xml",
+    "./godel-api/cfamily/schema",
+    "./godel-api/go/schema",
+    "./godel-api/java/schema",
+    "./godel-api/javascript/schema",
+    "./godel-api/properties/schema",
+    "./godel-api/python/schema",
+    "./godel-api/sql/schema",
+    "./godel-api/xml/schema"
+]
+for d in dirs:
+    if not os.path.exists(d):
+        os.mkdir(d)
+
+input_file_list = []
+for (path, dirname, filename) in os.walk(input_file_directory):
+    for file in filename:
+        input_file_list.append({"path": path, "name": file})
+
+name_mapper = {
+    "coref.cfamily.gdl": "cfamily",
+    "coref.go.gdl": "go",
+    "coref.java.gdl": "java",
+    "coref.javascript.gdl": "javascript",
+    "coref.properties.gdl": "properties",
+    "coref.python.gdl": "python",
+    "coref.sql.gdl": "sql",
+    "coref.xml.gdl": "xml"
+}
+
+semantic_dict = {}
+for file in input_file_list:
+    file_full_path = file["path"] + "/" + file["name"]
+    print("Extract semantic info from " + file_full_path)
+    result = subprocess.run(
+        [godel_compiler_path, "--dump-lsp", file_full_path],
+        stdout=subprocess.PIPE,
+        stderr=subprocess.DEVNULL
+    )
+    if result.returncode!=0:
+        continue
+    semantic_dict[file["name"]] = result.stdout.decode("utf-8")
+
+def dump_reference_main_doc():
+    output_file_path = markdown_output_path + "/coref_library_reference.md"
+    output_data = "---\n"
+    output_data += "title: \"COREF Library Reference\"\n"
+    output_data += "layout: default\n"
+    output_data += "nav_order: 2\n"
+    output_data += "has_children: true\n"
+    output_data += "---\n\n"
+    output_data += "# COREF Library Reference\n\n"
+    for file in input_file_list:
+        output_data += "* [coref::" + name_mapper[file["name"]] + "]"
+        output_data += "(./" + name_mapper[file["name"]] + "/reference.html)\n"
+    open(output_file_path, "w").write(output_data)
+
+dump_reference_main_doc()
+
+for file in input_file_list:
+    file_full_path = file["path"] + "/" + file["name"]
+    print("Generate markdown for " + file_full_path)
+    semantic_info = json.loads(semantic_dict[file["name"]])
+    comment_list = semantic_info["comments"]
+
+    output_data = "---\n"
+    output_data += "title: \"coref::" + name_mapper[file["name"]] + "\"\n"
+    output_data += "layout: default\n"
+    output_data += "has_children: true\n"
+    output_data += "parent: \"COREF Library Reference\"\n"
Reference\"\n" + output_data += "---\n" + output_data += "# COREF Library Reference for " + name_mapper[file["name"]] + "\n\n" + output_data += "* coref::" + name_mapper[file["name"]] + " [database](./database.html)\n" + output_data += "* coref::" + name_mapper[file["name"]] + " [function](./function.html)\n" + output_data += "* coref::" + name_mapper[file["name"]] + " [schema](./schema.html)\n" + output_file_path = markdown_output_path + "/" + name_mapper[file["name"]] + "/reference.md" + open(output_file_path, "w").write(output_data) + + output_data = "---\n" + output_data += "title: \"database\"\n" + output_data += "layout: default\n" + output_data += "parent: \"coref::" + name_mapper[file["name"]] + "\"\n" + output_data += "grand_parent: \"COREF Library Reference\"\n" + output_data += "---\n" + output_data += "# Database of " + file["name"] + "\n\n" + database_list = semantic_info["semantic"]["database"] + for database in database_list: + output_data += dump_database(database) + output_file_path = markdown_output_path + "/" + name_mapper[file["name"]] + "/database.md" + print("Generate", output_file_path) + open(output_file_path, "w").write(output_data) + + function_list = semantic_info["semantic"]["function"] + output_data = "---\n" + output_data += "title: \"function\"\n" + output_data += "layout: default\n" + output_data += "parent: \"coref::" + name_mapper[file["name"]] + "\"\n" + output_data += "grand_parent: \"COREF Library Reference\"\n" + output_data += "---\n" + output_data += "# Global Function of " + file["name"] + "\n\n" + for function in function_list: + if len(function["location"][0]) == 0: + continue + if function["is_public"]=="false": + continue + output_data += "## " + function["name"] + "\n\n" + comment = match_comment(comment_list, function) + if len(comment) > 0: + output_data += "```java\n" + comment + "\n```\n" + output_data += dump_function_parameter(function) + "\n" + output_data += "```rust\n" + output_data += dump_function(function) + "\n" + output_data += "```\n" + output_file_path = markdown_output_path + "/" + name_mapper[file["name"]] + "/function.md" + print("Generate", output_file_path) + open(output_file_path, "w").write(output_data) + + schema_list = semantic_info["semantic"]["schema"] + print("Generate schema documents for", file_full_path, ":", len(schema_list)) + for schema in schema_list: + output_data = dump_schema(comment_list, schema) + output_file_path = markdown_output_path + "/" + name_mapper[file["name"]] + "/schema/" + schema["name"] + ".md" + open(output_file_path, "w").write(output_data) + + output_data = "---\n" + output_data += "title: \"schema\"\n" + output_data += "layout: default\n" + output_data += "parent: \"coref::" + name_mapper[file["name"]] + "\"\n" + output_data += "grand_parent: \"COREF Library Reference\"\n" + output_data += "---\n" + output_data += "# Schema of " + file["name"] + "\n\n" + output_data += dump_schema_tree_view(schema_list) + output_file_path = markdown_output_path + "/" + name_mapper[file["name"]] + "/schema.md" + open(output_file_path, "w").write(output_data) + print("Generate schema documents for", file_full_path, ": Done") \ No newline at end of file