[aia] LLM-17290 add external api recall calculation, adapt chat code generation feature to a new report type

GitOrigin-RevId: 96f0f2fa5994a24b61d30a0824f447e35417d121
2026-01-08 15:09:39 +07:00 · 2025-06-04 10:42:56 +02:00
parent e8e0390d34
commit e2cc44b1ca
4 changed files with 190 additions and 17 deletions
--- a/plugins/evaluation-plugin/core/src/com/intellij/cce/evaluation/data/EvalDataConstants.kt
+++ b/plugins/evaluation-plugin/core/src/com/intellij/cce/evaluation/data/EvalDataConstants.kt
@@ -2,6 +2,8 @@ package com.intellij.cce.evaluation.data

 import com.intellij.cce.evaluable.*
 import com.intellij.cce.metric.*
+import com.intellij.cce.metric.ExternalApiRecall.Companion.AIA_GROUND_TRUTH_EXTERNAL_API_CALLS
+import com.intellij.cce.metric.ExternalApiRecall.Companion.AIA_PREDICTED_EXTERNAL_API_CALLS
 import com.intellij.cce.metric.context.MeanContextLines
 import com.intellij.cce.metric.context.MeanContextSize

@@ -194,6 +196,16 @@ object Analysis {
    ),
  )

+  val GROUND_TRUTH_EXTERNAL_API_CALLS: TrivialEvalData<List<String>> = EvalDataDescription( // expected (ground-truth) external API calls
+    name = "Ground truth external API calls",
+    description = "Bind with the list of initial external API calls",
+    DataPlacement.AdditionalConcatenatedLines(AIA_GROUND_TRUTH_EXTERNAL_API_CALLS), // same key ExternalApiRecall reads as the expected calls
+    presentation = EvalDataPresentation(
+      PresentationCategory.ANALYSIS,
+      renderer = DataRenderer.Lines,
+    )
+  )
+
  val PREDICTED_API_CALLS: TrivialEvalData<List<String>> = EvalDataDescription(
    name = "Predicted internal API calls",
    description = "Bind with the list of predicted internal API calls",
@@ -204,6 +216,16 @@ object Analysis {
    ),
  )

+  val PREDICTED_EXTERNAL_API_CALLS: TrivialEvalData<List<String>> = EvalDataDescription( // predicted external API calls
+    name = "Predicted external API calls",
+    description = "Bind with the list of predicted external API calls",
+    DataPlacement.AdditionalConcatenatedLines(AIA_PREDICTED_EXTERNAL_API_CALLS), // same key ExternalApiRecall reads as the predicted calls
+    presentation = EvalDataPresentation(
+      PresentationCategory.ANALYSIS,
+      renderer = DataRenderer.Lines,
+    )
+  )
+
  val FAILED_FILE_VALIDATIONS: TrivialEvalData<List<String>> = EvalDataDescription(
    name = "Failed file validations",
    description = "Bind with failed file validations",
@@ -330,14 +352,23 @@ object Metrics {
    dependencies = MetricDependencies(Analysis.ERASED_APIS)
  ) { PreservedApi() }

-  val API_RECALL: EvalMetric = EvalMetric(
+  val INTERNAL_API_RECALL: EvalMetric = EvalMetric( // recall over project-defined API calls; EXTERNAL_API_RECALL covers library-defined ones
    threshold = 1.0,
    dependencies = MetricDependencies(
      Analysis.GROUND_TRUTH_API_CALLS,
      Analysis.PREDICTED_API_CALLS,
      DataRenderer.TextDiff
    ) { initial, result -> TextUpdate(initial.sorted().joinToString("\n"), result.sorted().joinToString("\n")) }
-  ) { ApiRecall() }
+  ) { InternalApiRecall() }
+
+  val EXTERNAL_API_RECALL: EvalMetric = EvalMetric( // recall over library-defined API calls, computed by ExternalApiRecall
+    threshold = 1.0,
+    dependencies = MetricDependencies(
+      Analysis.GROUND_TRUTH_EXTERNAL_API_CALLS,
+      Analysis.PREDICTED_EXTERNAL_API_CALLS,
+      DataRenderer.TextDiff
+    ) { initial, result -> TextUpdate(initial.sorted().joinToString("\n"), result.sorted().joinToString("\n")) } // both sides are sorted before being diffed
+  ) { ExternalApiRecall() }

  val FILE_VALIDATIONS_SUCCESS: EvalMetric = EvalMetric(
    threshold = 1.0,
--- a/plugins/evaluation-plugin/core/src/com/intellij/cce/metric/ApiRecall.kt
+++ b/plugins/evaluation-plugin/core/src/com/intellij/cce/metric/ApiRecall.kt
@@ -6,22 +6,25 @@ import com.intellij.cce.evaluable.AIA_GROUND_TRUTH_INTERNAL_API_CALLS
 import com.intellij.cce.evaluable.AIA_PREDICTED_API_CALLS
 import com.intellij.cce.metric.util.Sample

-class ApiRecall : ConfidenceIntervalMetric<Double>() {
-  override val name: String = "API Recall"
-  override val description: String = "The fraction of correctly guessed project-defined API calls"
-  override val showByDefault: Boolean = true
-  override val valueType = MetricValueType.DOUBLE
-  override val value: Double
+abstract class ApiRecall : ConfidenceIntervalMetric<Double>() {
+  final override val showByDefault: Boolean = true
+  final override val valueType = MetricValueType.DOUBLE
+  final override val value: Double
    get() = compute(sample)

+  override val supportsIndividualScores: Boolean = true
+
+  abstract fun extractPredictedApiCallsFromLookup(lookup: Lookup): List<String>
+  abstract fun extractExpectedApiCallsFromLookup(lookup: Lookup): List<String>
+
  @Suppress("UNCHECKED_CAST")
-  override fun evaluate(sessions: List<Session>): Number {
+  final override fun evaluate(sessions: List<Session>): Number {
    val fileSample = Sample()
    sessions
      .flatMap { it.lookups }
      .forEach {
-        val predictedApiCalls = it.additionalList(AIA_PREDICTED_API_CALLS) ?: emptyList()
-        val groundTruthApiCalls = it.additionalList(AIA_GROUND_TRUTH_INTERNAL_API_CALLS) ?: emptyList()
+        val predictedApiCalls = extractPredictedApiCallsFromLookup(it)
+        val groundTruthApiCalls = extractExpectedApiCallsFromLookup(it)
        val apiRecall = calculateApiRecallForLookupSnippets(predictedApiCalls, groundTruthApiCalls)
        fileSample.add(apiRecall)
        coreSample.add(apiRecall)
@@ -29,7 +32,7 @@ class ApiRecall : ConfidenceIntervalMetric<Double>() {
    return fileSample.mean()
  }

-  override fun compute(sample: List<Double>): Double = sample.average()
+  final override fun compute(sample: List<Double>): Double = sample.average()

  private fun calculateApiRecallForLookupSnippets(
    predictedApiCalls: List<String>,
@@ -44,5 +47,36 @@ class ApiRecall : ConfidenceIntervalMetric<Double>() {
  }
 }

+/** [ApiRecall] over project-defined API calls. */
+class InternalApiRecall : ApiRecall() {
+  // Same display name the metric carried before ApiRecall became abstract.
+  override val name: String = "API Recall"
+  override val description: String = "The fraction of correctly guessed project-defined API calls"
+
+  override fun extractPredictedApiCallsFromLookup(lookup: Lookup): List<String> =
+    lookup.additionalList(AIA_PREDICTED_API_CALLS).orEmpty()
+
+  override fun extractExpectedApiCallsFromLookup(lookup: Lookup): List<String> =
+    lookup.additionalList(AIA_GROUND_TRUTH_INTERNAL_API_CALLS).orEmpty()
+}
+
+/** [ApiRecall] over library-defined (external) API calls. */
+class ExternalApiRecall : ApiRecall() {
+  override val name: String = "External API Recall"
+  override val description: String = "The fraction of correctly guessed library-defined API calls"
+
+  override fun extractPredictedApiCallsFromLookup(lookup: Lookup): List<String> =
+    lookup.additionalList(AIA_PREDICTED_EXTERNAL_API_CALLS).orEmpty()
+
+  override fun extractExpectedApiCallsFromLookup(lookup: Lookup): List<String> =
+    lookup.additionalList(AIA_GROUND_TRUTH_EXTERNAL_API_CALLS).orEmpty()
+
+  companion object {
+    // Lookup.additionalInfo keys holding the newline-separated call lists.
+    const val AIA_PREDICTED_EXTERNAL_API_CALLS = "external_api_calls"
+    const val AIA_GROUND_TRUTH_EXTERNAL_API_CALLS = "external_api_calls_gt"
+  }
+}
+
 internal fun Lookup.additionalList(key: String): List<String>? =
-  additionalInfo[key]?.let { it as String }?.split("\n")?.filter { it.isNotEmpty() }
+  additionalInfo[key]?.let { it as String }?.split("\n")?.filter { it.isNotEmpty() } // null when the key is absent; empty lines dropped; `as String` throws if the stored value is not a String
--- a/plugins/evaluation-plugin/languages/java/src/com/intellij/cce/java/chat/JavaApiCallExtractor.kt
+++ b/plugins/evaluation-plugin/languages/java/src/com/intellij/cce/java/chat/JavaApiCallExtractor.kt
@@ -112,10 +112,8 @@ fun extractCalledExternalApiMethodsQualifiedNames(psiElement: PsiElement): List<
    val psiMethodCall = (it as? PsiMethodCallExpression) ?: return@forEach
    val referenceName = psiMethodCall.methodExpression.referenceName ?: return@forEach
    val method = it.resolveMethod()
-    if (method != null && (
-        isInternalApiMethod(method, psiElement) ||
-        isFromStandardLibrary(method)
-                          )) {
+    if (method != null && (isInternalApiMethod(method, psiElement) ||
+                           isFromStandardLibrary(method))) {
      return@forEach
    }
    externalApiMethodsQualifiedNames.add(referenceName)
--- a/plugins/evaluation-plugin/test/com/intellij/cce/metric/InternalApiRecallTest.kt
+++ b/plugins/evaluation-plugin/test/com/intellij/cce/metric/InternalApiRecallTest.kt
@@ -0,0 +1,110 @@
+package com.intellij.cce.metric
+
+import com.intellij.cce.core.Lookup
+import org.junit.jupiter.api.Assertions.assertEquals
+import org.junit.jupiter.api.Test
+
+/** Checks how [InternalApiRecall] reads predicted and expected call lists out of a [Lookup]'s additional info. */
+class InternalApiRecallTest {
+  @Test
+  fun `extractPredictedApiCallsFromLookup should return list of predicted API calls when present`() {
+    val metric = InternalApiRecall()
+    val lookup = Lookup(
+      prefix = "test",
+      offset = 0,
+      suggestions = emptyList(),
+      latency = 10L,
+      isNew = false,
+      additionalInfo = mapOf("predicted_api_calls" to "call1\ncall2\ncall3")
+    )
+
+    val actual = metric.extractPredictedApiCallsFromLookup(lookup)
+
+    assertEquals(listOf("call1", "call2", "call3"), actual)
+  }
+
+  @Test
+  fun `extractPredictedApiCallsFromLookup should return empty list when predicted API calls are absent`() {
+    val metric = InternalApiRecall()
+    val lookup = Lookup(
+      prefix = "test",
+      offset = 0,
+      suggestions = emptyList(),
+      latency = 10L,
+      isNew = false,
+      additionalInfo = emptyMap()
+    )
+
+    val actual = metric.extractPredictedApiCallsFromLookup(lookup)
+
+    assertEquals(emptyList<String>(), actual)
+  }
+
+  @Test
+  fun `extractPredictedApiCallsFromLookup should return empty list when predicted API calls are empty`() {
+    val metric = InternalApiRecall()
+    val lookup = Lookup(
+      prefix = "test",
+      offset = 0,
+      suggestions = emptyList(),
+      latency = 10L,
+      isNew = false,
+      additionalInfo = mapOf("predicted_api_calls" to "")
+    )
+
+    val actual = metric.extractPredictedApiCallsFromLookup(lookup)
+
+    assertEquals(emptyList<String>(), actual)
+  }
+
+  @Test
+  fun `extractExpectedApiCallsFromLookup should return list of expected API calls when present`() {
+    val metric = InternalApiRecall()
+    val lookup = Lookup(
+      prefix = "test",
+      offset = 0,
+      suggestions = emptyList(),
+      latency = 10L,
+      isNew = false,
+      additionalInfo = mapOf("ground_truth_internal_api_calls" to "call1\ncall2\ncall3")
+    )
+
+    val actual = metric.extractExpectedApiCallsFromLookup(lookup)
+
+    assertEquals(listOf("call1", "call2", "call3"), actual)
+  }
+
+  @Test
+  fun `extractExpectedApiCallsFromLookup should return empty list when expected API calls are absent`() {
+    val metric = InternalApiRecall()
+    val lookup = Lookup(
+      prefix = "test",
+      offset = 0,
+      suggestions = emptyList(),
+      latency = 10L,
+      isNew = false,
+      additionalInfo = emptyMap()
+    )
+
+    val actual = metric.extractExpectedApiCallsFromLookup(lookup)
+
+    assertEquals(emptyList<String>(), actual)
+  }
+
+  @Test
+  fun `extractExpectedApiCallsFromLookup should return empty list when expected API calls are empty`() {
+    val metric = InternalApiRecall()
+    val lookup = Lookup(
+      prefix = "test",
+      offset = 0,
+      suggestions = emptyList(),
+      latency = 10L,
+      isNew = false,
+      additionalInfo = mapOf("ground_truth_internal_api_calls" to "")
+    )
+
+    val actual = metric.extractExpectedApiCallsFromLookup(lookup)
+
+    assertEquals(emptyList<String>(), actual)
+  }
+}