diff --git a/examples/muagent_examples/repochat/.env b/examples/muagent_examples/repochat/.env new file mode 100644 index 0000000..83daffc --- /dev/null +++ b/examples/muagent_examples/repochat/.env @@ -0,0 +1,3 @@ +github_token ="" +openai_api_key ="" +openai_base_url = "" \ No newline at end of file diff --git a/examples/muagent_examples/repochat/.env-tmp b/examples/muagent_examples/repochat/.env-tmp new file mode 100644 index 0000000..223f686 --- /dev/null +++ b/examples/muagent_examples/repochat/.env-tmp @@ -0,0 +1,6 @@ +github_token ="" +OPENAI_API_KEY = "" +API_BASE_URL = 'https://api.openai.com/v1/' +model_name = 'gpt-4o' +embed_model = 'text-embedding-ada-002' +model_engine = 'openai' diff --git a/examples/muagent_examples/repochat/README.md b/examples/muagent_examples/repochat/README.md new file mode 100644 index 0000000..f393744 --- /dev/null +++ b/examples/muagent_examples/repochat/README.md @@ -0,0 +1,8 @@ +muAgent-RepoChat + +## 需求 + 1. GitHub仓库克隆:用户提供GitHub仓库地址后,系统应自动克隆代码至指定的本地路径,便于后续分析。 + 2. 代码结构解析与信息提取:解析仓库结构,统计文件和文件夹数量,分析每个文件夹内容并生成描述。 + 3. 初始信息生成:基于文件结构,推测仓库的主要目的或功能,识别可能的启动文件及其路径,并提供如何启动仓库的指导。 + 4. 图数据库构建:将仓库内容以某种结构存入图数据库,以便后续的问答和查询。 + 5. 
class CodeRetrieval:
    """Index a local Java repository and run searches against that index.

    Model and endpoint settings are read from the environment after loading
    a ``.env`` file. The code base is built by :meth:`init_codebase` and
    queried through :meth:`search_code`.
    """

    def __init__(self, code_path, use_nh) -> None:
        """Validate *code_path* and prepare the LLM / embedding configuration.

        Args:
            code_path: Directory of the repository to index. It must exist
                and pass ``check_java_project``.
            use_nh: Forwarded to ``CodeBaseHandler`` as ``use_nh``. When
                truthy, the ``nb_*`` connection variables are written into
                ``os.environ`` first.

        Raises:
            KeyError: A required environment variable is not set.
            Exception: *code_path* does not exist or is not a Java project.
        """
        load_dotenv()
        api_key = os.environ["OPENAI_API_KEY"]
        api_base_url = os.environ["API_BASE_URL"]
        model_name = os.environ["model_name"]
        embed_model = os.environ["embed_model"]
        model_engine = os.environ["model_engine"]
        self.llm_config = LLMConfig(
            model_name=model_name,
            model_engine=model_engine,
            api_key=api_key,
            api_base_url=api_base_url,
            temperature=0.3,
        )
        self.embed_config = EmbedConfig(
            embed_engine=model_engine,
            embed_model=embed_model,
            api_key=api_key,
            api_base_url=api_base_url,
        )
        if use_nh:
            # NOTE(review): these values overwrite anything already present
            # in the environment (including values loaded from .env) --
            # confirm that forcing them is intended.
            os.environ['nb_host'] = 'graphd'
            os.environ['nb_port'] = '9669'
            os.environ['nb_username'] = 'root'
            os.environ['nb_password'] = 'nebula'
            os.environ['nb_space'] = "client"
        # The repository directory must exist before anything is indexed.
        if not os.path.exists(code_path):
            raise Exception(f"code_path {code_path} not exists")
        # Only Java projects are supported for now. TODO: add other languages.
        check_java_project(code_path)
        self.code_path = code_path
        self.lang = "java"
        self.use_nh = use_nh
        # Set by init_codebase(); stays None until the code base is built.
        self.cbh = None
        self.CB_ROOT_PATH = os.path.join(
            os.path.dirname(os.path.abspath(__file__)), "repobase"
        )
        os.makedirs(self.CB_ROOT_PATH, exist_ok=True)

    def init_codebase(self, codebase_name: str, do_interpret: bool = False):
        """Create the code base handler and import the repository's code.

        Args:
            codebase_name: Name passed to ``CodeBaseHandler``.
            do_interpret: Forwarded to ``CodeBaseHandler.import_code``.
                Previously this argument was accepted but ignored.
        """
        self.cbh = CodeBaseHandler(
            codebase_name,
            self.code_path,
            crawl_type='dir',
            use_nh=self.use_nh,
            local_graph_path=self.CB_ROOT_PATH,
            llm_config=self.llm_config,
            embed_config=self.embed_config,
            language=self.lang,
        )
        self.cbh.import_code(do_interpret=do_interpret)

    def search_code(self, query, search_type="cypher", limit=10):
        """Run *query* against the code base built by :meth:`init_codebase`.

        Args:
            query: Query passed through to ``CodeBaseHandler.search_code``.
            search_type: Search mode passed through unchanged.
            limit: Result limit passed through as ``limit``.

        Returns:
            Whatever ``CodeBaseHandler.search_code`` returns.

        Raises:
            RuntimeError: :meth:`init_codebase` has not been called yet.
        """
        handler = getattr(self, "cbh", None)
        if handler is None:
            raise RuntimeError(
                "code base is not initialised; call init_codebase() first"
            )
        return handler.search_code(query, search_type, limit=limit)
context_processors.py + defaultfilters.py + engine.py + context.py + utils.py + loader.py + loader_tags.py + exceptions.py + autoreload.py + base.py + defaulttags.py + + +[用户issue] +New template filter `escapeseq` +Description + +Following #34574, and after some conversations within the security team, it seems appropriate to provide a new template filter escapeseq which would be to escape what safeseq is to safe. An example of usage would be: +{{ some_list|escapeseq|join:"," }} +where each item of some_list is escaped before applying the join operation. This usage makes sense in a context where autoescape is off. + +Output: +{ + "files": { + "thoughts": "新的模板过滤器escapeseq会涉及到过滤器的具体实现文件。根据Django项目结构,这些过滤器通常定义在defaultfilters.py文件中。", + "file_path": ["django/django/template/defaultfilters.py"] + } +} +""" \ No newline at end of file diff --git a/examples/muagent_examples/repochat/requirements.txt b/examples/muagent_examples/repochat/requirements.txt new file mode 100644 index 0000000..06f1331 --- /dev/null +++ b/examples/muagent_examples/repochat/requirements.txt @@ -0,0 +1,4 @@ +python-dotenv +gitpython +codefuse-muagent +chainlit \ No newline at end of file diff --git a/examples/muagent_examples/repochat/tmp-java/Person.java b/examples/muagent_examples/repochat/tmp-java/Person.java new file mode 100644 index 0000000..b964150 --- /dev/null +++ b/examples/muagent_examples/repochat/tmp-java/Person.java @@ -0,0 +1,32 @@ +package com.example.bank; + +public class Person { + private String name; + private BankAccount account; + + public Person(String name) { + this.name = name; + this.account = new BankAccount(this); + } + + public String getName() { + return name; + } + + public BankAccount getAccount() { + return account; + } + + public void deposit(double amount) { + account.deposit(amount); + System.out.println(name + " deposited " + amount + " to their account."); + } + + public void withdraw(double amount) { + if (account.withdraw(amount)) { + System.out.println(name 
def get_directory_structure(directory_path: str, notallow: set = None):
    """Return the file tree under *directory_path* as an indented string.

    Each directory is written as ``name/`` and its entries are indented four
    more spaces. Entries are listed in sorted order so the result does not
    depend on filesystem ordering.

    :param directory_path: str, directory to walk
    :param notallow: set, extra file or directory names to leave out, in
        addition to the built-in exclusions; defaults to None
    :return: str, the tree, one entry per line
    """
    excluded = {'.git', '__pycache__', '.idea', '.github', '.tx'}
    if notallow:
        excluded.update(notallow)

    lines = []
    for root, dirs, files in os.walk(directory_path):
        # Prune in place so os.walk does not descend into excluded dirs.
        dirs[:] = sorted(d for d in dirs if d not in excluded)

        # Depth is derived from the relative path; stripping the prefix with
        # str.replace miscounted when directory_path ended with a separator
        # or appeared again deeper in the tree.
        rel = os.path.relpath(root, directory_path)
        level = 0 if rel == os.curdir else rel.count(os.sep) + 1

        indent = ' ' * 4 * level
        lines.append(f"{indent}{os.path.basename(os.path.normpath(root))}/")
        child_indent = ' ' * 4 * (level + 1)
        lines.extend(
            f"{child_indent}{name}"
            for name in sorted(files)
            if name not in excluded
        )

    return "\n".join(lines)


def check_java_project(code_path):
    """Raise unless *code_path* looks like a Java project.

    A project qualifies when it has a ``pom.xml`` at its top level or a
    ``.java`` file anywhere below it.

    :param code_path: directory to inspect
    :raises Exception: neither a ``pom.xml`` nor any ``.java`` file was found
    """
    is_java = os.path.exists(os.path.join(code_path, "pom.xml")) or any(
        name.endswith(".java")
        for _root, _dirs, files in os.walk(code_path)
        for name in files
    )
    if not is_java:
        raise Exception(f"code_path {code_path} is not a Java project")
    print(f"code_path {code_path} is a Java project")


def _repo_name_from_url(repo_url):
    """Return the directory name for *repo_url* (no trailing '/' or '.git')."""
    path = urllib.parse.urlparse(repo_url).path.rstrip('/')
    name = path.split('/')[-1]
    if name.endswith('.git'):
        name = name[:-len('.git')]
    return name


def _with_token(repo_url, token):
    """Return *repo_url* with *token* as its userinfo (http/https only)."""
    parts = urllib.parse.urlsplit(repo_url)
    if parts.scheme not in ("http", "https"):
        return repo_url
    # Drop any userinfo already in the URL so the result has a single '@'.
    host = parts.netloc.rsplit('@', 1)[-1]
    return urllib.parse.urlunsplit(parts._replace(netloc=f"{token}@{host}"))


def clone_repo_with_token(repo_url, clone_to):
    """Clone a GitHub repository that requires authentication.

    The token is read from the ``github_token`` environment variable after
    loading ``.env``. If the target directory already exists it is returned
    without cloning again.

    Args:
        repo_url (str): URL of the repository.
        clone_to (str): Local directory that will contain the clone.

    Returns:
        str: Path of the cloned repository (``clone_to`` plus the repository
        name) on success, or an empty string on failure.
    """
    token = None
    try:
        os.makedirs(clone_to, exist_ok=True)
        load_dotenv()
        token = os.getenv('github_token')
        if not token:
            raise ValueError("GitHub token not found in environment variables")

        repo_name = _repo_name_from_url(repo_url)
        if not repo_name:
            # An empty name would make cloned_path equal to clone_to, which
            # always exists, so the function would report success without
            # cloning anything.
            raise ValueError(
                f"Cannot determine repository name from URL: {repo_url}"
            )

        cloned_path = os.path.join(clone_to, repo_name)
        if os.path.exists(cloned_path):
            return cloned_path

        git.Repo.clone_from(_with_token(repo_url, token), cloned_path)

        print(f"Repository cloned to {cloned_path}")
        return cloned_path
    except Exception as e:
        # Best-effort contract: report and return ''. Git errors can echo
        # the authenticated URL, so mask the token before printing.
        message = str(e)
        if token:
            message = message.replace(token, "***")
        print(f"Failed to clone repository: {message}")
        return ''
def parse_python_file(file_path, file_content=None):
    """Extract class, function and import names from one Python source file.

    :param file_path: path of the Python file; it is read from disk only when
        *file_content* is None and is always used to build ``pac_name``
    :param file_content: source text; when given, the file is not opened
    :return: dict with the keys ``pac_name`` (``"<file_path>#<file name>"``),
        ``class_name_list``, ``func_name_dict`` (class name ->
        ``"Class.method"`` names, and function name -> ``[function name]``)
        and ``import_pac_name_list``. When the file cannot be read or parsed
        the same keys are returned with empty collections, so callers always
        receive one shape.
    """
    result = {
        'pac_name': f"{file_path}#{os.path.basename(file_path)}",
        'class_name_list': [],
        'func_name_dict': {},
        'import_pac_name_list': [],
    }
    try:
        if file_content is None:
            with open(file_path, "r", encoding="utf-8") as file:
                file_content = file.read()
        parsed_data = ast.parse(file_content)
    except Exception as e:
        # Deliberately broad: one unreadable or malformed file must not
        # abort the analysis of the others.
        print(f"文件 {file_path} 解析错误: {e}")
        return result

    class_names = result['class_name_list']
    func_names = result['func_name_dict']
    imports = result['import_pac_name_list']

    # ast.walk yields a node before its children, so a class is always seen
    # before its methods. Remembering the method nodes keeps them from being
    # recorded a second time as if they were free functions.
    method_nodes = set()
    for node in ast.walk(parsed_data):
        if isinstance(node, ast.ClassDef):
            class_names.append(node.name)
            for member in node.body:
                # Only plain ``def`` members are recorded; ``async def`` is
                # a different node type and is skipped here.
                if isinstance(member, ast.FunctionDef):
                    method_nodes.add(member)
                    func_names.setdefault(node.name, []).append(
                        f"{node.name}.{member.name}"
                    )
        elif isinstance(node, ast.FunctionDef):
            if node not in method_nodes:
                func_names.setdefault(node.name, []).append(node.name)
        elif isinstance(node, (ast.Import, ast.ImportFrom)):
            # For ``from x import y`` this records ``y``, not ``x``.
            imports.extend(alias.name for alias in node.names)

    return result


class PythonStaticAnalysis:
    """Static analyzer that extracts entities from Python source files."""

    def __init__(self):
        pass

    def analyze(self, python_code_dict):
        '''
        Parse Python code and extract its entities.

        @param python_code_dict: dict mapping file path -> source text
        @return: dict mapping source text -> result of parse_python_file
        '''
        # NOTE(review): results are keyed by the source text, so two files
        # with identical content (e.g. empty __init__.py files) collapse into
        # one entry -- confirm this is what the consumer expects.
        res = {}
        for fp, python_code in python_code_dict.items():
            res[python_code] = parse_python_file(fp, python_code)
        return res
'__main__': + python_code_dict = { + 'test': '''import unittest + +class UtilsTest(unittest.TestCase): + def test_remove_char(self): + input_str = "hello" + ch = 'l' + expected = "heo" + res = Utils.remove(input_str, ch) + self.assertEqual(res, expected) + +if __name__ == "__main__": + unittest.main() +''' + } + + psa = PythonStaticAnalysis() + res = psa.analyze(python_code_dict) + logger.info(res) \ No newline at end of file diff --git a/muagent/codechat/codebase_handler/codebase_handler.py b/muagent/codechat/codebase_handler/codebase_handler.py index d9bcf98..e98b364 100644 --- a/muagent/codechat/codebase_handler/codebase_handler.py +++ b/muagent/codechat/codebase_handler/codebase_handler.py @@ -133,6 +133,26 @@ def crawl_code(self, zip_file=''): ''' if self.language == 'java': suffix = 'java' + elif self.language == 'python': + suffix = 'py' + elif self.language == 'javascript': + suffix = 'js' + elif self.language == 'csharp': + suffix = 'cs' + elif self.language == 'cpp': + suffix = 'cpp' + elif self.language == 'ruby': + suffix = 'rb' + elif self.language == 'go': + suffix = 'go' + elif self.language == 'php': + suffix = 'php' + elif self.language == 'swift': + suffix = 'swift' + elif self.language == 'kotlin': + suffix = 'kt' + else: + raise ValueError(f"Unsupported language: {self.language}") logger.info(f'crawl_type={self.crawl_type}') diff --git a/requirements.txt b/requirements.txt index 84f0432..d2034aa 100644 --- a/requirements.txt +++ b/requirements.txt @@ -29,4 +29,5 @@ urllib3==1.26.6 sseclient ollama colorama -pycryptodome \ No newline at end of file +pycryptodome +libcst \ No newline at end of file