Skip to content

Commit 5f5b6b5

Browse files
committed
feat(domain-dict): extract domain concepts using TreeSitter
Integrate TreeSitter-based code parsing to identify business entities from hot files, filtering out technical terms for more accurate DDD domain dictionary generation.
1 parent 8d08b65 commit 5f5b6b5

File tree

2 files changed

+236
-40
lines changed

2 files changed

+236
-40
lines changed

mpp-core/src/commonMain/kotlin/cc/unitmesh/agent/subagent/DomainDictAgent.kt

Lines changed: 231 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,10 @@ import cc.unitmesh.agent.tool.impl.CodebaseInsightsTool
1313
import cc.unitmesh.agent.tool.impl.HotFileInfo
1414
import cc.unitmesh.agent.tool.schema.DeclarativeToolSchema
1515
import cc.unitmesh.agent.tool.schema.SchemaPropertyBuilder.string
16+
import cc.unitmesh.codegraph.model.CodeElementType
17+
import cc.unitmesh.codegraph.model.CodeNode
18+
import cc.unitmesh.codegraph.parser.CodeParser
19+
import cc.unitmesh.codegraph.parser.Language
1620
import cc.unitmesh.devins.filesystem.ProjectFileSystem
1721
import cc.unitmesh.devins.parser.CodeFence
1822
import cc.unitmesh.indexer.DomainDictService
@@ -68,22 +72,25 @@ data class DomainDictCallbacks(
6872
)
6973

7074
/**
71-
* DomainDictAgent - Simple, DDD-focused domain dictionary generator
75+
* DomainDictAgent - DDD-focused domain dictionary generator
7276
*
73-
* Design principles:
74-
* 1. Extract REAL data from codebase (class names, patterns)
75-
* 2. Filter and clean (remove generic terms, tests)
76-
* 3. Use AI ONLY for translation/description (with strict input)
77+
* Design Principles (DDD perspective):
78+
* 1. Extract REAL business entities from code (not technical infrastructure)
79+
* 2. Focus on HOT FILES (frequently changed = core business logic)
80+
* 3. Use TreeSitter to parse class/function names from important files
81+
* 4. Filter out technical suffixes (Controller, Service, Repository, etc.)
82+
* 5. AI only translates business concepts, NOT implementation details
7783
*
7884
* 3-Step Process:
79-
* 1. Analyze: Scan codebase for meaningful class/concept names
80-
* 2. Generate: Use AI to translate names to Chinese with descriptions
85+
* 1. Analyze: Scan Git history for hot files, use TreeSitter to extract class/function names
86+
* 2. Generate: Use AI with DDD principles to translate business concepts
8187
* 3. Save: Merge with existing dictionary
8288
*/
8389
class DomainDictAgent(
8490
private val llmService: KoogLLMService,
8591
private val fileSystem: ProjectFileSystem,
8692
private val domainDictService: DomainDictService,
93+
private val codeParser: CodeParser? = null,
8794
maxDefaultIterations: Int = 1,
8895
private val enableStreaming: Boolean = true
8996
) : SubAgent<DomainDictContext, ToolResult.AgentResult>(
@@ -271,28 +278,41 @@ class DomainDictAgent(
271278
return result
272279
}
273280

274-
private fun extractMeaningfulNames(
281+
/**
282+
* Extract meaningful names using TreeSitter parsing on hot files
283+
* Priority: Hot files (frequently changed) contain core business logic
284+
*/
285+
private suspend fun extractMeaningfulNames(
275286
insights: CodebaseInsightsResult,
276287
onProgress: (String) -> Unit
277288
): List<String> {
278289
val names = mutableSetOf<String>()
279290

280-
// 1. Extract from hot file names (most important)
291+
// 1. Use TreeSitter to parse hot files and extract class/function names
292+
if (codeParser != null) {
293+
onProgress(" 🌲 Using TreeSitter to parse hot files...")
294+
val hotFilesWithCode = parseHotFilesWithTreeSitter(insights.hotFiles, onProgress)
295+
names.addAll(hotFilesWithCode)
296+
}
297+
298+
// 2. Fallback: Extract from file names
281299
for (file in insights.hotFiles) {
282300
val fileName = file.path.substringAfterLast("/").substringBeforeLast(".")
283-
if (isValidDomainName(fileName)) {
284-
names.add(fileName)
301+
val domainName = extractDomainFromFileName(fileName)
302+
if (domainName != null && isValidDomainName(domainName)) {
303+
names.add(domainName)
285304
}
286305

287306
// Extract class name if available
288307
file.className?.let { className ->
289-
if (isValidDomainName(className)) {
290-
names.add(className)
308+
val extracted = extractDomainFromClassName(className)
309+
if (extracted != null && isValidDomainName(extracted)) {
310+
names.add(extracted)
291311
}
292312
}
293313
}
294314

295-
// 2. Extract from domain concepts (filtered)
315+
// 3. Extract from domain concepts (filtered)
296316
for (concept in insights.domainConcepts) {
297317
if (isValidDomainName(concept.name) && concept.occurrences >= 2) {
298318
names.add(concept.name)
@@ -302,14 +322,149 @@ class DomainDictAgent(
302322
return names.toList().sortedBy { it }
303323
}
304324

325+
/**
 * Parses hot files with the configured [CodeParser] and collects candidate domain names
 * from the declarations found in them.
 *
 * Only the first 30 entries of [hotFiles] are examined. A file is skipped when
 * [detectLanguage] does not recognise its extension or when its content cannot be read.
 * Class, interface and enum names go through [extractDomainFromClassName]; method and
 * function names go through [extractDomainFromMethodName]. Every candidate must also pass
 * [isValidDomainName] before it is kept.
 *
 * Parsing is best-effort: a file that throws while being read or parsed is skipped so that
 * one bad file cannot abort the whole analysis. Coroutine cancellation is the exception to
 * that rule and is always rethrown, so a cancelled run stops promptly.
 *
 * @param hotFiles hot files to analyse
 * @param onProgress receives a one-line summary when at least one file was parsed
 * @return the distinct domain names found; empty when no parser is configured
 */
private suspend fun parseHotFilesWithTreeSitter(
    hotFiles: List<HotFileInfo>,
    onProgress: (String) -> Unit
): Set<String> {
    val names = mutableSetOf<String>()
    val parser = codeParser ?: return names

    var parsedCount = 0

    // Limit deep analysis to the first 30 hot files.
    for (file in hotFiles.take(30)) {
        val language = detectLanguage(file.path) ?: continue

        try {
            val content = fileSystem.readFile(file.path) ?: continue
            val nodes = parser.parseNodes(content, file.path, language)

            for (node in nodes) {
                val candidate = when (node.type) {
                    CodeElementType.CLASS, CodeElementType.INTERFACE, CodeElementType.ENUM ->
                        extractDomainFromClassName(node.name)

                    CodeElementType.METHOD, CodeElementType.FUNCTION ->
                        extractDomainFromMethodName(node.name)

                    else -> null
                }
                if (candidate != null && isValidDomainName(candidate)) {
                    names.add(candidate)
                }
            }
            parsedCount++
        } catch (e: kotlin.coroutines.cancellation.CancellationException) {
            // Cancellation must propagate; swallowing it would keep a cancelled job running.
            throw e
        } catch (e: Exception) {
            // Deliberate best-effort: skip files that fail to read or parse.
        }
    }

    if (parsedCount > 0) {
        onProgress(" 📦 Parsed $parsedCount hot files, found ${names.size} domain concepts")
    }

    return names
}
378+
379+
/**
 * Maps a file path to the parser [Language] implied by its extension.
 *
 * The extension is the text after the last `.` in [filePath], compared case-insensitively.
 *
 * @param filePath path of the file to classify
 * @return the matching language, or `null` when the path contains no `.` or the extension
 *   is not one of the listed ones
 */
private fun detectLanguage(filePath: String): Language? =
    when (filePath.substringAfterLast(".", "").lowercase()) {
        "kt", "kts" -> Language.KOTLIN
        "java" -> Language.JAVA
        "ts", "tsx" -> Language.TYPESCRIPT
        "js", "jsx" -> Language.JAVASCRIPT
        "py" -> Language.PYTHON
        "go" -> Language.GO
        "rs" -> Language.RUST
        else -> null
    }
395+
396+
/**
 * Derives a domain concept from a file (or class) name by removing trailing technical
 * suffixes.
 *
 * Suffixes are stripped repeatedly until none matches, so stacked suffixes are fully
 * removed:
 * - `"DomainDictAgent"` -> `"DomainDict"`
 * - `"UserServiceImpl"` -> `"User"`
 * - `"PaymentControllerTest"` -> `"Payment"`
 *
 * A suffix is never stripped when it makes up the whole name, so `"Service"` stays
 * `"Service"`.
 *
 * @param fileName file name without directory or extension
 * @return the remaining name, or `null` when fewer than 3 characters are left
 */
private fun extractDomainFromFileName(fileName: String): String? {
    val suffixes = listOf(
        "Controller", "Service", "Repository", "Dao", "Mapper",
        "Impl", "Helper", "Utils", "Util", "Factory", "Builder",
        "Handler", "Listener", "Adapter", "Wrapper", "Provider",
        "Agent", "Tool", "Config", "Configuration", "Settings",
        "Test", "Spec", "Mock", "Fake", "Stub"
    )

    var name = fileName
    // Each pass removes at least one character, so the loop always terminates.
    while (true) {
        val suffix = suffixes.firstOrNull { name.endsWith(it) && name.length > it.length } ?: break
        name = name.removeSuffix(suffix)
    }

    return name.takeIf { it.length >= 3 }
}
420+
421+
/**
 * Derives a domain concept from a class name.
 *
 * Class names follow the same rules as file names, so this delegates to
 * [extractDomainFromFileName].
 *
 * @param className simple class name
 * @return whatever [extractDomainFromFileName] returns for [className]
 */
private fun extractDomainFromClassName(className: String): String? =
    extractDomainFromFileName(className)
427+
428+
/**
 * Derives a domain concept from a method name by dropping a leading verb-like prefix.
 *
 * The first listed prefix that is followed by an upper-case letter is removed:
 * - `"createBlogPost"` -> `"BlogPost"`
 * - `"validatePayment"` -> `"Payment"`
 *
 * A prefix followed by a lower-case letter is not treated as a prefix, so `"setupPayment"`
 * is left unchanged and then rejected because it does not start with an upper-case letter.
 *
 * @param methodName method or function name as declared in source
 * @return the remaining name when it has at least 4 characters and starts with an
 *   upper-case letter, otherwise `null`
 */
private fun extractDomainFromMethodName(methodName: String): String? {
    val verbPrefixes = listOf(
        "get", "set", "is", "has", "can", "should", "will",
        "create", "update", "delete", "find", "fetch", "load",
        "save", "add", "remove", "build", "parse", "validate",
        "check", "process", "handle", "execute", "run", "init",
        "on", "to", "from"
    )

    // First prefix (in list order) that is a proper prefix and is followed by an upper-case letter.
    val matchedPrefix = verbPrefixes.firstOrNull { prefix ->
        methodName.length > prefix.length &&
            methodName.startsWith(prefix) &&
            methodName[prefix.length].isUpperCase()
    }

    val candidate = if (matchedPrefix != null) {
        methodName.substring(matchedPrefix.length)
    } else {
        methodName
    }

    return candidate.takeIf { it.length >= 4 && it[0].isUpperCase() }
}
456+
305457
/**
306458
* Check if a name is a valid domain concept (not a generic term)
459+
* Using DDD principles to filter out technical infrastructure
307460
*/
308461
private fun isValidDomainName(name: String): Boolean {
309462
if (name.length < 4) return false // Skip very short names
310463
if (name.length > 50) return false
311464

312-
// Skip generic/common terms
465+
val lowerName = name.lowercase()
466+
467+
// Skip generic/common terms (infrastructure, not domain)
313468
val skipTerms = setOf(
314469
// Testing
315470
"test", "tests", "spec", "mock", "stub", "fake",
@@ -335,25 +490,42 @@ class DomainDictAgent(
335490
"button", "text", "label", "field", "input", "output",
336491
"editor", "renderer", "painter", "drawer",
337492
"exception", "error", "warning", "message",
338-
"checks", "diff", "check"
493+
"checks", "diff", "check", "unknown"
339494
)
340495

341-
val lowerName = name.lowercase()
342-
343496
// Exact match skip
344497
if (lowerName in skipTerms) return false
345498

346-
// Skip IntelliJ platform concepts
499+
// Skip IntelliJ platform concepts (infrastructure)
347500
val platformTerms = setOf(
348501
"anaction", "applicationmanager", "project", "psifile", "psielement",
349502
"virtualfile", "document", "editor", "intention", "inspection",
350503
"psiclass", "psimethod", "psifield", "psitype", "psivariable",
351504
"language", "filetype", "module", "facet", "artifact",
352505
"toolwindow", "notification", "progress", "indicator",
353-
"runnable", "callable", "future", "promise", "deferred"
506+
"runnable", "callable", "future", "promise", "deferred",
507+
// JetBrains specific
508+
"jbcolor", "jbinsets", "jbui", "jbpopup", "jblist",
509+
// Java Swing/AWT
510+
"jcomponent", "jpanel", "jbutton", "jlabel", "jframe",
511+
"swing", "awt", "graphics"
354512
)
355513
if (platformTerms.any { lowerName.contains(it) }) return false
356514

515+
// Skip technical suffixes that indicate infrastructure
516+
val technicalSuffixes = setOf(
517+
"controller", "service", "repository", "dao", "mapper",
518+
"dto", "vo", "po", "entity", "request", "response",
519+
"config", "configuration", "settings", "properties",
520+
"handler", "listener", "callback", "adapter", "wrapper",
521+
"factory", "builder", "provider", "manager", "registry",
522+
"helper", "util", "utils", "tool", "tools",
523+
"impl", "implementation", "abstract", "base", "default",
524+
"exception", "error", "filter", "interceptor",
525+
"capable", "aware", "enabled", "disabled"
526+
)
527+
if (technicalSuffixes.any { lowerName.endsWith(it) }) return false
528+
357529
// Contains skip (for compound names like "TestHelper")
358530
val containsSkip = setOf("test", "spec", "mock", "fake", "stub", "factory", "util")
359531
if (containsSkip.any { lowerName.contains(it) }) return false
@@ -384,27 +556,48 @@ class DomainDictAgent(
384556

385557
val namesList = names.joinToString("\n") { "- $it" }
386558

559+
// DDD-focused prompt, inspired by indexer.vm
387560
val prompt = """
388-
你是一个技术文档翻译专家。请将以下代码中的类名/概念名翻译成简洁的中文术语。
389-
390-
## 要翻译的名称:
561+
你是一个 DDD(领域驱动设计)专家,负责构建业务导向的中英文词典。请从以下代码名称中提取重要的业务概念。
562+
563+
**提取原则:**
564+
565+
✅ 应该提取的内容:
566+
- 核心业务实体(如:Blog、Comment、Payment、User 等名词)
567+
- 业务概念和领域模型(如:Member、Points、Order)
568+
- 难以理解的词汇或拼音缩写
569+
- 领域特定术语
570+
571+
❌ 应该排除的内容:
572+
1. 技术词汇:Controller、Service、Repository、Mapper、DTO、VO、PO、Entity、Request、Response、Config 等
573+
2. 实现细节和数据传输对象:包含 "Request"、"Response"、"Dto"、"Entity" 后缀的条目
574+
3. 技术操作动词:validate、check、convert、deserialize、serialize、encode、decode 等
575+
4. 方法名中的技术操作:如 "checkIfVipAccount" 应只提取 "VIP Account"
576+
5. 通用库 API(如 Spring、OkHttp)和通用类名(如 List、Map)
577+
578+
**处理规则:**
579+
1. 如果提取的条目包含技术后缀(如 "CreateCommentDto"),转换为纯业务概念(如 "Comment")
580+
2. 如果方法名包含技术操作(如 "checkIfVipAccount"),提取业务含义("VIP Account")
581+
3. 如果类名包含技术词汇后缀,移除后缀再添加到词典
582+
583+
## 要分析的名称:
391584
$namesList
392-
585+
393586
## 输出格式 (JSON):
394-
```json
395-
{
396-
"entries": [
397-
{"chinese": "中文术语", "codeTranslation": "ClassName", "description": "一句话描述功能"}
398-
]
399-
}
400-
```
401-
402-
## 规则:
403-
1. chinese: 简洁的中文术语(2-6个字)
404-
2. codeTranslation: 保持原始类名
405-
3. description: 一句话描述(不超过30字)
406-
4. 只翻译有意义的领域概念
407-
5. 跳过无法理解或太通用的名称
587+
```json
588+
{
589+
"entries": [
590+
{"chinese": "博客", "codeTranslation": "Blog", "description": "博客文章"}
591+
]
592+
}
593+
```
594+
595+
## 输出规则:
596+
1. chinese: 简洁的中文术语2-6个字
597+
2. codeTranslation: 纯业务概念名(移除技术后缀)
598+
3. description: 一句话业务描述(不超过20字)
599+
4. 只输出有意义的业务概念,跳过技术实现细节
600+
5. 如果无法理解或太通用,直接跳过不输出
408601
409602
请直接输出JSON,不要其他解释。
410603
""".trimIndent()

mpp-ui/src/jvmMain/kotlin/cc/unitmesh/server/cli/DomainDictCli.kt

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package cc.unitmesh.server.cli
22

33
import cc.unitmesh.agent.subagent.DomainDictAgent
44
import cc.unitmesh.agent.subagent.DomainDictContext
5+
import cc.unitmesh.codegraph.parser.jvm.JvmCodeParser
56
import cc.unitmesh.devins.filesystem.DefaultFileSystem
67
import cc.unitmesh.indexer.DomainDictService
78
import cc.unitmesh.llm.KoogLLMService
@@ -98,12 +99,14 @@ object DomainDictCli {
9899
println("Current dictionary entries: $currentEntryCount")
99100
println()
100101

101-
// Create agent
102-
println("Creating DomainDictAgent...")
102+
// Create agent with TreeSitter parser for deep code analysis
103+
println("Creating DomainDictAgent with TreeSitter parser...")
104+
val codeParser = JvmCodeParser()
103105
val agent = DomainDictAgent(
104106
llmService = llmService,
105107
fileSystem = fileSystem,
106108
domainDictService = domainDictService,
109+
codeParser = codeParser,
107110
maxDefaultIterations = 7
108111
)
109112

0 commit comments

Comments
 (0)