diff --git a/plugins/evaluation-plugin/core/src/com/intellij/cce/evaluable/AiaConstants.kt b/plugins/evaluation-plugin/core/src/com/intellij/cce/evaluable/AiaConstants.kt
index f8731c93e4c6..fe1d31e9a7c0 100644
--- a/plugins/evaluation-plugin/core/src/com/intellij/cce/evaluable/AiaConstants.kt
+++ b/plugins/evaluation-plugin/core/src/com/intellij/cce/evaluable/AiaConstants.kt
@@ -1,14 +1,23 @@
 package com.intellij.cce.evaluable
 
+// String identifiers of evaluation data; EvalDataConstants.kt passes several of them to
+// DataPlacement.Additional* placements.
 const val AIA_CONTEXT = "aia_context"
+const val AIA_SYSTEM_CONTEXT = "aia_system_context"
+const val AIA_CHAT_DUMP = "aia_chat_dump"
+const val AIA_CONTEXT_COLLECTION_DURATION_MS = "aia_context_collection_duration_ms"
 const val AIA_USER_PROMPT = "aia_user_prompt"
 const val AIA_RESPONSE = "aia_response"
+const val AIA_NAME = "aia_name"
+const val AIA_DESCRIPTION = "aia_description"
 const val AIA_ORIGINAL_CONTENT = "aia_original_content"
 const val AIA_FAILED_FILE_VALIDATIONS = "aia_failed_file_validations"
 const val AIA_FAILED_RELATED_FILE_VALIDATIONS = "aia_failed_related_file_validations"
 const val AIA_HAS_SYNTAX_ERRORS = "has_syntax_errors"
 const val AIA_HAS_HIGHLIGHT_ERRORS = "has_highlight_errors"
 const val AIA_PREDICTED_API_CALLS = "predicted_api_calls"
+const val AIA_EXPECTED_FUNCTION_CALLS = "expected_function_calls"
+const val AIA_ACTUAL_FUNCTION_CALLS = "actual_function_calls"
 const val AIA_GROUND_TRUTH_INTERNAL_API_CALLS = "ground_truth_internal_api_calls"
 const val AIA_ERASED_APIS = "erased_apis"
 const val AIA_HIGHLIGHT_ERRORS = "appeared_highlights"
@@ -17,3 +26,7 @@ const val AIA_TEST_LINE_COVERAGE = "test_line_coverage"
 const val AIA_TEST_BRANCH_COVERAGE = "test_branch_coverage"
 const val AIA_TEST_FILE_PROVIDED = "test_file_provided"
 const val LLM_JUDGE_RESPONSE = "llm_judge_response"
+const val AIA_HAS_NO_EFFECT = "has_no_effect"
+const val AIA_EXACT_MATCH = "exact_match"
+const val AIA_AST_MATCH = "ast_match"
+const val AIA_PROBLEMS = "aia_problems"
diff --git a/plugins/evaluation-plugin/core/src/com/intellij/cce/evaluation/data/EvalDataConstants.kt b/plugins/evaluation-plugin/core/src/com/intellij/cce/evaluation/data/EvalDataConstants.kt
new file mode 100644
index 000000000000..11d3bfc426ed
--- /dev/null
+++ b/plugins/evaluation-plugin/core/src/com/intellij/cce/evaluation/data/EvalDataConstants.kt
@@ -0,0 +1,334 @@
+package com.intellij.cce.evaluation.data
+
+import com.intellij.cce.evaluable.*
+import com.intellij.cce.metric.*
+import com.intellij.cce.metric.context.MeanContextLines
+import com.intellij.cce.metric.context.MeanContextSize
+
+/**
+ * Data descriptions shown in the [PresentationCategory.RESULT] category:
+ * the updated content of the current file and of other files (actual and expected).
+ */
+object Result {
+  val CURRENT_FILE_UPDATE: EvalDataDescription<TextUpdate, TextUpdate> = EvalDataDescription(
+    name = "Current file update",
+    description = "Bind with the result content of the current file",
+    placement = DataPlacement.CurrentFileUpdate,
+    presentation = EvalDataPresentation(
+      PresentationCategory.RESULT,
+      DataRenderer.TextDiff,
+      DynamicName.CurrentFileName,
+    )
+  )
+
+  val FILE_UPDATES: EvalDataDescription<List<FileUpdate>, FileUpdate> = EvalDataDescription(
+    name = "File updates",
+    description = "Bind with all updated files",
+    placement = DataPlacement.FileUpdates("file_updates"),
+    presentation = EvalDataPresentation(
+      PresentationCategory.RESULT,
+      DataRenderer.TextDiff,
+      DynamicName.FileName,
+      ignoreMissingData = true
+    )
+  )
+
+  val EXPECTED_FILE_UPDATES: EvalDataDescription<List<FileUpdate>, FileUpdate> = EvalDataDescription(
+    name = "Expected file updates",
+    description = "Bind with all expected updated files",
+    placement = DataPlacement.FileUpdates("expected_file_updates"),
+    presentation = EvalDataPresentation(
+      PresentationCategory.RESULT,
+      DataRenderer.TextDiff,
+      DynamicName.Formatted("Expected ", DynamicName.FileName),
+      ignoreMissingData = true
+    )
+  )
+}
+
+/**
+ * Data descriptions shown in the [PresentationCategory.EXECUTION] category
+ * (latency, user request, LLM response and prompts), plus the name and
+ * description of an evaluation case, which declare no presentation here.
+ */
+object Execution {
+  val LATENCY: TrivialEvalData<Long> = EvalDataDescription(
+    name = "Latency",
+    description = "Bind with millis spent for inference",
+    placement = DataPlacement.Latency,
+    presentation = EvalDataPresentation(
+      PresentationCategory.EXECUTION,
+      DataRenderer.InlineLong
+    )
+  )
+
+  val USER_REQUEST: TrivialEvalData<String> = EvalDataDescription(
+    name = "User request",
+    description = "Request provided by user",
+    placement = DataPlacement.AdditionalText(AIA_USER_PROMPT),
+    presentation = EvalDataPresentation(
+      PresentationCategory.EXECUTION,
+      DataRenderer.Text
+    )
+  )
+
+  val LLM_RESPONSE: TrivialEvalData<String> = EvalDataDescription(
+    name = "LLM response",
+    description = "LLM response",
+    placement = DataPlacement.AdditionalText(AIA_RESPONSE),
+    presentation = EvalDataPresentation(
+      PresentationCategory.EXECUTION,
+      DataRenderer.Text
+    )
+  )
+
+  val LLM_CONTEXT: TrivialEvalData<String> = EvalDataDescription(
+    name = "LLM context",
+    description = "Result prompt used for LLM",
+    placement = DataPlacement.AdditionalText(AIA_CONTEXT),
+    presentation = EvalDataPresentation(
+      PresentationCategory.EXECUTION,
+      DataRenderer.Text
+    )
+  )
+
+  val LLM_SYSTEM_CONTEXT: TrivialEvalData<String> = EvalDataDescription(
+    name = "LLM system context",
+    description = "Result system prompt used for LLM",
+    placement = DataPlacement.AdditionalText(AIA_SYSTEM_CONTEXT),
+    presentation = EvalDataPresentation(
+      PresentationCategory.EXECUTION,
+      DataRenderer.Text
+    )
+  )
+
+  val LLM_CHAT_DUMP: TrivialEvalData<String> = EvalDataDescription(
+    name = "LLM chat dump",
+    description = "Full dump of the chat session including system context, messages, and metadata",
+    placement = DataPlacement.AdditionalText(AIA_CHAT_DUMP),
+    presentation = EvalDataPresentation(
+      PresentationCategory.EXECUTION,
+      DataRenderer.Text
+    )
+  )
+
+  // NOTE(review): this description text is identical to DESCRIPTION's - confirm it is intended.
+  val NAME: TrivialEvalData<String> = EvalDataDescription(
+    name = "Name",
+    description = "Some description of an evaluation case",
+    placement = DataPlacement.AdditionalText(AIA_NAME),
+  )
+
+  val DESCRIPTION: TrivialEvalData<String> = EvalDataDescription(
+    name = "Preview",
+    description = "Some description of an evaluation case",
+    placement = DataPlacement.AdditionalText(AIA_DESCRIPTION),
+  )
+}
+
+/**
+ * Data descriptions shown in the [PresentationCategory.ANALYSIS] category.
+ *
+ * Entries that declare `problemIndicators` mostly refer to a metric from [Metrics]
+ * through a lambda. [Metrics] entries reference [Analysis] entries directly, so the
+ * lambda presumably exists to avoid an initialization cycle between the two objects.
+ */
+object Analysis {
+  val HAS_SYNTAX_ERRORS: TrivialEvalData<Boolean> = EvalDataDescription(
+    name = "Has syntax errors",
+    description = "Bind with `true` if the result has syntax errors",
+    placement = DataPlacement.AdditionalBoolean(AIA_HAS_SYNTAX_ERRORS),
+    presentation = EvalDataPresentation(
+      PresentationCategory.ANALYSIS,
+      DataRenderer.InlineBoolean,
+    ),
+    problemIndicators = listOf(
+      ProblemIndicator.FromMetric { Metrics.WITHOUT_SYNTAX_ERRORS }
+    )
+  )
+
+  val HIGHLIGHT_ERRORS: TrivialEvalData<List<String>> = EvalDataDescription(
+    name = "Highlight errors and warnings",
+    description = "Bind with the list of appeared highlights in format `[ERROR] error_description` or `[WARNING] warning_description`",
+    placement = DataPlacement.AdditionalConcatenatedLines(AIA_HIGHLIGHT_ERRORS),
+    presentation = EvalDataPresentation(
+      PresentationCategory.ANALYSIS,
+      DataRenderer.Lines,
+    ),
+    problemIndicators = listOf(
+      ProblemIndicator.FromMetric { Metrics.WITHOUT_HIGHLIGHT_ERRORS }
+    )
+  )
+
+  val ERASED_APIS: TrivialEvalData<List<String>> = EvalDataDescription(
+    name = "Erased APIs",
+    description = "Bind with the list of erased API names",
+    placement = DataPlacement.AdditionalConcatenatedLines(AIA_ERASED_APIS),
+    presentation = EvalDataPresentation(
+      PresentationCategory.ANALYSIS,
+      DataRenderer.Lines,
+    ),
+    problemIndicators = listOf(
+      ProblemIndicator.FromMetric { Metrics.PRESERVED_API }
+    )
+  )
+
+  val GROUND_TRUTH_API_CALLS: TrivialEvalData<List<String>> = EvalDataDescription(
+    name = "Ground truth internal API calls",
+    description = "Bind with the list of initial internal API calls",
+    placement = DataPlacement.AdditionalConcatenatedLines(AIA_GROUND_TRUTH_INTERNAL_API_CALLS),
+    presentation = EvalDataPresentation(
+      PresentationCategory.ANALYSIS,
+      DataRenderer.Lines,
+    ),
+  )
+
+  val PREDICTED_API_CALLS: TrivialEvalData<List<String>> = EvalDataDescription(
+    name = "Predicted internal API calls",
+    description = "Bind with the list of predicted internal API calls",
+    placement = DataPlacement.AdditionalConcatenatedLines(AIA_PREDICTED_API_CALLS),
+    presentation = EvalDataPresentation(
+      PresentationCategory.ANALYSIS,
+      DataRenderer.Lines,
+    ),
+  )
+
+  val FAILED_FILE_VALIDATIONS: TrivialEvalData<List<String>> = EvalDataDescription(
+    name = "Failed file validations",
+    description = "Bind with failed file validations",
+    placement = DataPlacement.AdditionalConcatenatedLines(AIA_FAILED_FILE_VALIDATIONS),
+    presentation = EvalDataPresentation(
+      PresentationCategory.ANALYSIS,
+      DataRenderer.Lines,
+    ),
+    problemIndicators = listOf(
+      ProblemIndicator.FromMetric { Metrics.FILE_VALIDATIONS_SUCCESS }
+    )
+  )
+
+  val FAILED_RELATED_FILE_VALIDATIONS: TrivialEvalData<List<String>> = EvalDataDescription(
+    name = "Failed related file validations",
+    description = "Bind with failed file validations in related files",
+    placement = DataPlacement.AdditionalConcatenatedLines(AIA_FAILED_RELATED_FILE_VALIDATIONS),
+    presentation = EvalDataPresentation(
+      PresentationCategory.ANALYSIS,
+      DataRenderer.Lines,
+    ),
+    problemIndicators = listOf(
+      // The metric whose dependency is this data (not FILE_VALIDATIONS_SUCCESS).
+      ProblemIndicator.FromMetric { Metrics.RELATED_FILE_VALIDATIONS_SUCCESS }
+    )
+  )
+
+  val HAS_NO_EFFECT: TrivialEvalData<Boolean> = EvalDataDescription(
+    name = "Has no effect",
+    description = "Bind with `true` if nothing has happened",
+    placement = DataPlacement.AdditionalBoolean(AIA_HAS_NO_EFFECT),
+    presentation = EvalDataPresentation(
+      PresentationCategory.ANALYSIS,
+      DataRenderer.InlineBoolean,
+    ),
+    problemIndicators = listOf(
+      ProblemIndicator.FromValue { it }
+    )
+  )
+
+  // NOTE(review): EXACT_MATCH and AST_MATCH are placed as doubles while their descriptions speak of `true` - confirm.
+  val EXACT_MATCH: TrivialEvalData<Double> = EvalDataDescription(
+    name = "Exact match",
+    description = "Bind with `true` if result matches expected one",
+    placement = DataPlacement.AdditionalDouble(AIA_EXACT_MATCH),
+    presentation = EvalDataPresentation(
+      PresentationCategory.ANALYSIS,
+      DataRenderer.InlineDouble,
+    ),
+    problemIndicators = listOf(
+      ProblemIndicator.FromMetric { Metrics.EXACT_MATCH }
+    )
+  )
+
+  val AST_MATCH: TrivialEvalData<Double> = EvalDataDescription(
+    name = "Ast match",
+    description = "Bind with `true` if result AST matches expected one",
+    placement = DataPlacement.AdditionalDouble(AIA_AST_MATCH),
+    presentation = EvalDataPresentation(
+      PresentationCategory.ANALYSIS,
+      DataRenderer.InlineDouble,
+    ),
+    problemIndicators = listOf(
+      ProblemIndicator.FromMetric { Metrics.AST_MATCH }
+    )
+  )
+}
+
+/**
+ * Metric definitions. Each entry passes a factory lambda that creates the metric and may
+ * set `threshold`, `showInCard` and its data `dependencies` from [Execution] and [Analysis].
+ */
+object Metrics {
+  val SESSION_COUNT: EvalMetric = EvalMetric(
+    showInCard = false
+  ) { SessionsCountMetric() }
+
+  val PRECISION: EvalMetric = EvalMetric(
+    threshold = 1.0
+  ) { PrecisionMetric() }
+
+  val MEAN_LATENCY: EvalMetric = EvalMetric(
+    showInCard = false,
+    dependencies = MetricDependencies(Execution.LATENCY)
+  ) { MeanLatencyMetric() }
+
+  val MEAN_CONTEXT_SIZE: EvalMetric = EvalMetric(
+    dependencies = MetricDependencies(Execution.LLM_CONTEXT)
+  ) { MeanContextSize() }
+
+  val MEAN_CONTEXT_LINES: EvalMetric = EvalMetric(
+    dependencies = MetricDependencies(Execution.LLM_CONTEXT)
+  ) { MeanContextLines() }
+
+  val WITHOUT_SYNTAX_ERRORS: EvalMetric = EvalMetric(
+    threshold = 1.0,
+    dependencies = MetricDependencies(Analysis.HAS_SYNTAX_ERRORS)
+  ) { WithoutSyntaxErrorsSessionRatio() }
+
+  val WITHOUT_HIGHLIGHT_ERRORS: EvalMetric = EvalMetric(
+    threshold = 1.0,
+    dependencies = MetricDependencies(Analysis.HIGHLIGHT_ERRORS)
+  ) { WithoutHighlightErrorsSessionRatio() }
+
+  val PRESERVED_API: EvalMetric = EvalMetric(
+    threshold = 1.0,
+    dependencies = MetricDependencies(Analysis.ERASED_APIS)
+  ) { PreservedApi() }
+
+  val API_RECALL: EvalMetric = EvalMetric(
+    threshold = 1.0,
+    // Both call lists are sorted and joined line by line, then paired as a TextUpdate for the TextDiff renderer.
+    dependencies = MetricDependencies(
+      Analysis.GROUND_TRUTH_API_CALLS,
+      Analysis.PREDICTED_API_CALLS,
+      DataRenderer.TextDiff
+    ) { initial, result -> TextUpdate(initial.sorted().joinToString("\n"), result.sorted().joinToString("\n")) }
+  ) { ApiRecall() }
+
+  val FILE_VALIDATIONS_SUCCESS: EvalMetric = EvalMetric(
+    threshold = 1.0,
+    dependencies = MetricDependencies(Analysis.FAILED_FILE_VALIDATIONS)
+  ) { FileValidationSuccess() }
+
+  val RELATED_FILE_VALIDATIONS_SUCCESS: EvalMetric = EvalMetric(
+    threshold = 1.0,
+    dependencies = MetricDependencies(Analysis.FAILED_RELATED_FILE_VALIDATIONS)
+  ) { RelatedFileValidationSuccess() }
+
+  val EXACT_MATCH: EvalMetric = EvalMetric(
+    threshold = 1.0,
+    dependencies = MetricDependencies(Analysis.EXACT_MATCH)
+  ) { ExactMatchMetric() }
+
+  val AST_MATCH: EvalMetric = EvalMetric(
+    threshold = 1.0,
+    dependencies = MetricDependencies(Analysis.AST_MATCH)
+  ) { AstMatchMetric() }
+}