[LME-409] Add more report fields instead of LLM-response

Integrated a new field `chatDump` to capture and store the full chat session, including system context, messages, and metadata. This enhances data recording and presentation capabilities for evaluation processes. The response field now contains the last message from the chat. Merge-request: IJ-MR-158449 Merged-by: Berkay Özerbay <berkay.ozerbay@jetbrains.com> (cherry picked from commit 95259fbf8d7b53e48c4fb47e6ec9316ca3e358a3) GitOrigin-RevId: 9ca54a94e759ce481979a1d2fe53f71634796905
2026-03-22 15:10:43 +07:00 · 2025-04-14 09:22:21 +00:00
parent defb0a50ea
commit 191d266208
2 changed files with 321 additions and 0 deletions
--- a/plugins/evaluation-plugin/core/src/com/intellij/cce/evaluable/AiaConstants.kt
+++ b/plugins/evaluation-plugin/core/src/com/intellij/cce/evaluable/AiaConstants.kt
@@ -1,14 +1,21 @@
 package com.intellij.cce.evaluable

 const val AIA_CONTEXT = "aia_context"
+const val AIA_SYSTEM_CONTEXT = "aia_system_context"
+const val AIA_CHAT_DUMP = "aia_chat_dump"
+const val AIA_CONTEXT_COLLECTION_DURATION_MS = "aia_context_collection_duration_ms"
 const val AIA_USER_PROMPT = "aia_user_prompt"
 const val AIA_RESPONSE = "aia_response"
+const val AIA_NAME: String = "aia_name"
+const val AIA_DESCRIPTION: String = "aia_description"
 const val AIA_ORIGINAL_CONTENT = "aia_original_content"
 const val AIA_FAILED_FILE_VALIDATIONS = "aia_failed_file_validations"
 const val AIA_FAILED_RELATED_FILE_VALIDATIONS = "aia_failed_related_file_validations"
 const val AIA_HAS_SYNTAX_ERRORS = "has_syntax_errors"
 const val AIA_HAS_HIGHLIGHT_ERRORS = "has_highlight_errors"
 const val AIA_PREDICTED_API_CALLS = "predicted_api_calls"
+const val AIA_EXPECTED_FUNCTION_CALLS = "expected_function_calls"
+const val AIA_ACTUAL_FUNCTION_CALLS = "actual_function_calls"
 const val AIA_GROUND_TRUTH_INTERNAL_API_CALLS = "ground_truth_internal_api_calls"
 const val AIA_ERASED_APIS = "erased_apis"
 const val AIA_HIGHLIGHT_ERRORS = "appeared_highlights"
@@ -17,3 +24,7 @@ const val AIA_TEST_LINE_COVERAGE = "test_line_coverage"
 const val AIA_TEST_BRANCH_COVERAGE = "test_branch_coverage"
 const val AIA_TEST_FILE_PROVIDED = "test_file_provided"
 const val LLM_JUDGE_RESPONSE = "llm_judge_response"
+const val AIA_HAS_NO_EFFECT: String = "has_no_effect"
+const val AIA_EXACT_MATCH: String = "exact_match"
+const val AIA_AST_MATCH: String = "ast_match"
+const val AIA_PROBLEMS: String = "aia_problems"
--- a/plugins/evaluation-plugin/core/src/com/intellij/cce/evaluation/data/EvalDataConstants.kt
+++ b/plugins/evaluation-plugin/core/src/com/intellij/cce/evaluation/data/EvalDataConstants.kt
@@ -0,0 +1,310 @@
+package com.intellij.cce.evaluation.data
+
+import com.intellij.cce.evaluable.*
+import com.intellij.cce.metric.*
+import com.intellij.cce.metric.context.MeanContextLines
+import com.intellij.cce.metric.context.MeanContextSize
+
+object Result {
+  val CURRENT_FILE_UPDATE: EvalDataDescription<String, TextUpdate> = EvalDataDescription(
+    name = "Current file update",
+    description = "Bind with the result content of the current file",
+    DataPlacement.CurrentFileUpdate,
+    presentation = EvalDataPresentation(
+      PresentationCategory.RESULT,
+      DataRenderer.TextDiff,
+      DynamicName.CurrentFileName,
+    )
+  )
+
+  val FILE_UPDATES: EvalDataDescription<List<FileUpdate>, FileUpdate> = EvalDataDescription(
+    name = "File updates",
+    description = "Bind with all updated files",
+    DataPlacement.FileUpdates("file_updates"),
+    presentation = EvalDataPresentation(
+      PresentationCategory.RESULT,
+      DataRenderer.TextDiff,
+      DynamicName.FileName,
+      ignoreMissingData = true
+    )
+  )
+
+  val EXPECTED_FILE_UPDATES: EvalDataDescription<List<FileUpdate>, FileUpdate> = EvalDataDescription(
+    name = "Expected file updates",
+    description = "Bind with all expected updated files",
+    DataPlacement.FileUpdates("expected_file_updates"),
+    presentation = EvalDataPresentation(
+      PresentationCategory.RESULT,
+      DataRenderer.TextDiff,
+      DynamicName.Formatted("Expected ", DynamicName.FileName),
+      ignoreMissingData = true
+    )
+  )
+}
+
+object Execution {
+  val LATENCY: TrivialEvalData<Long> = EvalDataDescription(
+    name = "Latency",
+    description = "Bind with millis spent for inference",
+    DataPlacement.Latency,
+    presentation = EvalDataPresentation(
+      PresentationCategory.EXECUTION,
+      DataRenderer.InlineLong
+    )
+  )
+
+  val USER_REQUEST: TrivialEvalData<String> = EvalDataDescription(
+    name = "User request",
+    description = "Request provided by user",
+    DataPlacement.AdditionalText(AIA_USER_PROMPT),
+    presentation = EvalDataPresentation(
+      PresentationCategory.EXECUTION,
+      DataRenderer.Text
+    )
+  )
+
+  val LLM_RESPONSE: TrivialEvalData<String> = EvalDataDescription(
+    name = "LLM response",
+    description = "LLM response",
+    DataPlacement.AdditionalText(AIA_RESPONSE),
+    presentation = EvalDataPresentation(
+      PresentationCategory.EXECUTION,
+      DataRenderer.Text
+    )
+  )
+
+  val LLM_CONTEXT: TrivialEvalData<String> = EvalDataDescription(
+    name = "LLM context",
+    description = "Result prompt used for LLM",
+    DataPlacement.AdditionalText(AIA_CONTEXT),
+    presentation = EvalDataPresentation(
+      PresentationCategory.EXECUTION,
+      DataRenderer.Text
+    )
+  )
+
+  val LLM_SYSTEM_CONTEXT: TrivialEvalData<String> = EvalDataDescription(
+    name = "LLM system context",
+    description = "Result system prompt used for LLM",
+    DataPlacement.AdditionalText(AIA_SYSTEM_CONTEXT),
+    presentation = EvalDataPresentation(
+      PresentationCategory.EXECUTION,
+      DataRenderer.Text
+    )
+  )
+
+  val LLM_CHAT_DUMP: TrivialEvalData<String> = EvalDataDescription(
+    name = "LLM chat dump",
+    description = "Full dump of the chat session including system context, messages, and metadata",
+    placement = DataPlacement.AdditionalText(AIA_CHAT_DUMP),
+    presentation = EvalDataPresentation(
+      PresentationCategory.EXECUTION,
+      DataRenderer.Text
+    )
+  )
+
+  val NAME: TrivialEvalData<String> = EvalDataDescription(
+    name = "Name",
+    description = "Some description of an evaluation case",
+    DataPlacement.AdditionalText(AIA_NAME),
+  )
+
+  val DESCRIPTION: TrivialEvalData<String> = EvalDataDescription(
+    name = "Preview",
+    description = "Some description of an evaluation case",
+    DataPlacement.AdditionalText(AIA_DESCRIPTION),
+  )
+}
+
+object Analysis {
+  val HAS_SYNTAX_ERRORS: TrivialEvalData<Boolean> = EvalDataDescription(
+    name = "Has syntax errors",
+    description = "Bind with `true` if the result has syntax errors",
+    DataPlacement.AdditionalBoolean(AIA_HAS_SYNTAX_ERRORS),
+    presentation = EvalDataPresentation(
+      PresentationCategory.ANALYSIS,
+      DataRenderer.InlineBoolean,
+    ),
+    problemIndicators = listOf(
+      ProblemIndicator.FromMetric { Metrics.WITHOUT_SYNTAX_ERRORS }
+    )
+  )
+
+  val HIGHLIGHT_ERRORS: TrivialEvalData<List<String>> = EvalDataDescription(
+    name = "Highlight errors and warnings",
+    description = "Bind with the list of appeared highlights in format `[ERROR] error_description` or `[WARNING] warning_description]`",
+    DataPlacement.AdditionalConcatenatedLines(AIA_HIGHLIGHT_ERRORS),
+    presentation = EvalDataPresentation(
+      PresentationCategory.ANALYSIS,
+      renderer = DataRenderer.Lines,
+    ),
+    problemIndicators = listOf(
+      ProblemIndicator.FromMetric { Metrics.WITHOUT_HIGHLIGHT_ERRORS }
+    )
+  )
+
+  val ERASED_APIS: TrivialEvalData<List<String>> = EvalDataDescription(
+    name = "Erased APIs",
+    description = "Bind with the list of erased API names",
+    DataPlacement.AdditionalConcatenatedLines(AIA_ERASED_APIS),
+    presentation = EvalDataPresentation(
+      PresentationCategory.ANALYSIS,
+      DataRenderer.Lines,
+    ),
+    problemIndicators = listOf(
+      ProblemIndicator.FromMetric { Metrics.PRESERVED_API }
+    )
+  )
+
+  val GROUND_TRUTH_API_CALLS: TrivialEvalData<List<String>> = EvalDataDescription(
+    name = "Ground truth internal API calls",
+    description = "Bind with the list of initial internal API calls",
+    DataPlacement.AdditionalConcatenatedLines(AIA_GROUND_TRUTH_INTERNAL_API_CALLS),
+    presentation = EvalDataPresentation(
+      PresentationCategory.ANALYSIS,
+      renderer = DataRenderer.Lines,
+    ),
+  )
+
+  val PREDICTED_API_CALLS: TrivialEvalData<List<String>> = EvalDataDescription(
+    name = "Predicted internal API calls",
+    description = "Bind with the list of predicted internal API calls",
+    DataPlacement.AdditionalConcatenatedLines(AIA_PREDICTED_API_CALLS),
+    presentation = EvalDataPresentation(
+      PresentationCategory.ANALYSIS,
+      renderer = DataRenderer.Lines,
+    ),
+  )
+
+  val FAILED_FILE_VALIDATIONS: TrivialEvalData<List<String>> = EvalDataDescription(
+    name = "Failed file validations",
+    description = "Bind with failed file validations",
+    placement = DataPlacement.AdditionalConcatenatedLines(AIA_FAILED_FILE_VALIDATIONS),
+    presentation = EvalDataPresentation(
+      PresentationCategory.ANALYSIS,
+      DataRenderer.Lines,
+    ),
+    problemIndicators = listOf(
+      ProblemIndicator.FromMetric { Metrics.FILE_VALIDATIONS_SUCCESS }
+    )
+  )
+
+  val FAILED_RELATED_FILE_VALIDATIONS: TrivialEvalData<List<String>> = EvalDataDescription(
+    name = "Failed related file validations",
+    description = "Bind with failed file validations in related files",
+    placement = DataPlacement.AdditionalConcatenatedLines(AIA_FAILED_RELATED_FILE_VALIDATIONS),
+    presentation = EvalDataPresentation(
+      PresentationCategory.ANALYSIS,
+      DataRenderer.Lines,
+    ),
+    problemIndicators = listOf(
+      ProblemIndicator.FromMetric { Metrics.FILE_VALIDATIONS_SUCCESS }
+    )
+  )
+
+  val HAS_NO_EFFECT: TrivialEvalData<Boolean> = EvalDataDescription(
+    name = "Has no effect",
+    description = "Bind with `true` if nothing has happened",
+    DataPlacement.AdditionalBoolean(AIA_HAS_NO_EFFECT),
+    presentation = EvalDataPresentation(
+      PresentationCategory.ANALYSIS,
+      DataRenderer.InlineBoolean,
+    ),
+    problemIndicators = listOf(
+      ProblemIndicator.FromValue { it }
+    )
+  )
+
+  val EXACT_MATCH: TrivialEvalData<Double> = EvalDataDescription(
+    name = "Exact match",
+    description = "Bind with `true` if result matches expected one",
+    DataPlacement.AdditionalDouble(AIA_EXACT_MATCH),
+    presentation = EvalDataPresentation(
+      PresentationCategory.ANALYSIS,
+      DataRenderer.InlineDouble,
+    ),
+    problemIndicators = listOf(
+      ProblemIndicator.FromMetric { Metrics.EXACT_MATCH }
+    )
+  )
+
+  val AST_MATCH: TrivialEvalData<Double> = EvalDataDescription(
+    name = "Ast match",
+    description = "Bind with `true` if result AST matches expected one",
+    DataPlacement.AdditionalDouble(AIA_AST_MATCH),
+    presentation = EvalDataPresentation(
+      PresentationCategory.ANALYSIS,
+      DataRenderer.InlineDouble,
+    ),
+    problemIndicators = listOf(
+      ProblemIndicator.FromMetric { Metrics.AST_MATCH }
+    )
+  )
+}
+
+object Metrics {
+  val SESSION_COUNT: EvalMetric = EvalMetric(
+    showInCard = false
+  ) { SessionsCountMetric() }
+
+  val PRECISION: EvalMetric = EvalMetric(
+    threshold = 1.0
+  ) { PrecisionMetric() }
+
+  val MEAN_LATENCY: EvalMetric = EvalMetric(
+    showInCard = false,
+    dependencies = MetricDependencies(Execution.LATENCY)
+  ) { MeanLatencyMetric() }
+
+  val MEAN_CONTEXT_SIZE: EvalMetric = EvalMetric(
+    dependencies = MetricDependencies(Execution.LLM_CONTEXT)
+  ) { MeanContextSize() }
+
+  val MEAN_CONTEXT_LINES: EvalMetric = EvalMetric(
+    dependencies = MetricDependencies(Execution.LLM_CONTEXT)
+  ) { MeanContextLines() }
+
+  val WITHOUT_SYNTAX_ERRORS: EvalMetric = EvalMetric(
+    threshold = 1.0,
+    dependencies = MetricDependencies(Analysis.HAS_SYNTAX_ERRORS)
+  ) { WithoutSyntaxErrorsSessionRatio() }
+
+  val WITHOUT_HIGHLIGHT_ERRORS: EvalMetric = EvalMetric(
+    threshold = 1.0,
+    dependencies = MetricDependencies(Analysis.HIGHLIGHT_ERRORS)
+  ) { WithoutHighlightErrorsSessionRatio() }
+
+  val PRESERVED_API: EvalMetric = EvalMetric(
+    threshold = 1.0,
+    dependencies = MetricDependencies(Analysis.ERASED_APIS)
+  ) { PreservedApi() }
+
+  val API_RECALL: EvalMetric = EvalMetric(
+    threshold = 1.0,
+    dependencies = MetricDependencies(
+      Analysis.GROUND_TRUTH_API_CALLS,
+      Analysis.PREDICTED_API_CALLS,
+      DataRenderer.TextDiff
+    ) { initial, result -> TextUpdate(initial.sorted().joinToString("\n"), result.sorted().joinToString("\n")) }
+  ) { ApiRecall() }
+
+  val FILE_VALIDATIONS_SUCCESS: EvalMetric = EvalMetric(
+    threshold = 1.0,
+    dependencies = MetricDependencies(Analysis.FAILED_FILE_VALIDATIONS)
+  ) { FileValidationSuccess() }
+
+  val RELATED_FILE_VALIDATIONS_SUCCESS: EvalMetric = EvalMetric(
+    threshold = 1.0,
+    dependencies = MetricDependencies(Analysis.FAILED_RELATED_FILE_VALIDATIONS)
+  ) { RelatedFileValidationSuccess() }
+
+  val EXACT_MATCH: EvalMetric = EvalMetric(
+    threshold = 1.0,
+    dependencies = MetricDependencies(Analysis.EXACT_MATCH)
+  ) { ExactMatchMetric() }
+
+  val AST_MATCH: EvalMetric = EvalMetric(
+    threshold = 1.0,
+    dependencies = MetricDependencies(Analysis.AST_MATCH)
+  ) { AstMatchMetric() }
+}