Performance Optimization and Profiling: Advanced Techniques for Scala Applications

Performance optimization is crucial for building scalable, responsive Scala applications. This comprehensive lesson covers advanced profiling techniques, JVM tuning, memory management, algorithmic optimizations, and systematic approaches to identifying and resolving performance bottlenecks.

Understanding Performance Fundamentals

Performance Metrics and Measurement

import scala.util.Random
import java.util.concurrent.TimeUnit
import scala.concurrent.duration._
import java.lang.management.ManagementFactory

// Performance measurement utilities
/**
 * Micro-benchmark helpers: latency/throughput measurement, a coarse heap-usage delta and
 * per-thread CPU utilisation.
 *
 * The figures are indicative only: heap deltas are taken from `Runtime` counters around
 * `System.gc()` hints, which the JVM is free to ignore.
 */
object PerformanceMetrics {

  /**
   * Summary of one benchmark run.
   *
   * @param operation      label of the benchmarked operation
   * @param totalTime      wall-clock time of the whole measurement phase
   * @param iterations     number of timed invocations the statistics are based on
   * @param throughput     operations per second over `totalTime`
   * @param averageLatency mean latency of a single invocation
   * @param p95Latency     95th-percentile latency of a single invocation
   * @param p99Latency     99th-percentile latency of a single invocation
   * @param memoryUsed     change in used heap (bytes) across the measurement phase; may be negative
   */
  case class BenchmarkResult(
    operation: String,
    totalTime: Duration,
    iterations: Long,
    throughput: Double, // operations per second
    averageLatency: Duration,
    p95Latency: Duration,
    p99Latency: Duration,
    memoryUsed: Long
  )

  /**
   * Single-threaded micro-benchmark.
   *
   * Runs `operation` `warmupIterations` times untimed (to give the JIT a chance to compile it),
   * requests a GC, then times `measurementIterations` invocations individually.
   *
   * @param name                  label copied into the result
   * @param warmupIterations      untimed invocations before measuring; values <= 0 skip warmup
   * @param measurementIterations timed invocations; values <= 0 produce an all-zero result
   * @param operation             the code under test
   * @return latency and throughput statistics for the measurement phase
   */
  def benchmark[T](
    name: String,
    warmupIterations: Int = 1000,
    measurementIterations: Int = 10000,
    operation: () => T
  ): BenchmarkResult = {

    var warm = 0
    while (warm < warmupIterations) {
      operation()
      warm += 1
    }

    // Ask for a collection so garbage from warmup is less likely to be charged to the measurement.
    System.gc()
    Thread.sleep(100)

    // Primitive, pre-sized storage: recording a sample neither boxes nor resizes a buffer.
    val samples = new Array[Long](math.max(0, measurementIterations))

    val memoryBefore = getUsedMemory()
    val startTime = System.nanoTime()

    var i = 0
    while (i < samples.length) {
      val operationStart = System.nanoTime()
      operation()
      samples(i) = System.nanoTime() - operationStart
      i += 1
    }

    val totalNanos = System.nanoTime() - startTime
    val memoryAfter = getUsedMemory()

    summarize(name, samples, totalNanos, memoryAfter - memoryBefore)
  }

  /**
   * Multi-threaded benchmark: `threadCount` workers each invoke `operation`
   * `operationsPerThread` times concurrently.
   *
   * `iterations` in the result is the number of invocations that actually completed; it equals
   * `threadCount * operationsPerThread` unless `operation` threw in some worker.
   *
   * @param name                label; the result's name is suffixed with the thread count
   * @param threadCount         number of worker threads; must be positive
   * @param operationsPerThread timed invocations per worker
   * @param operation           the code under test; must be safe to call concurrently
   * @throws IllegalArgumentException if `threadCount` is not positive
   */
  def benchmarkConcurrent[T](
    name: String,
    threadCount: Int,
    operationsPerThread: Int,
    operation: () => T
  ): BenchmarkResult = {

    import java.util.concurrent.{CountDownLatch, Executors}

    require(threadCount > 0, s"threadCount must be positive, got $threadCount")

    // One private sample array per worker: no shared queue, no boxing, no contention while timing.
    val perThread = Array.fill(threadCount)(new Array[Long](math.max(0, operationsPerThread)))
    val completed = new Array[Int](threadCount)

    val executor = Executors.newFixedThreadPool(threadCount)
    try {
      val latch = new CountDownLatch(threadCount)

      // Warmup
      var warm = 0
      while (warm < 1000) {
        operation()
        warm += 1
      }
      System.gc()
      Thread.sleep(100)

      val memoryBefore = getUsedMemory()
      val startTime = System.nanoTime()

      for (worker <- 0 until threadCount) {
        val samples = perThread(worker)
        executor.execute(() => {
          var done = 0
          try {
            while (done < samples.length) {
              val operationStart = System.nanoTime()
              operation()
              samples(done) = System.nanoTime() - operationStart
              done += 1
            }
          } finally {
            // Publish how many samples are valid; countDown/await makes the writes visible.
            completed(worker) = done
            latch.countDown()
          }
        })
      }

      latch.await()
      val totalNanos = System.nanoTime() - startTime
      val memoryAfter = getUsedMemory()

      // Merge only the samples that were really recorded.
      val merged = new Array[Long](completed.sum)
      var offset = 0
      for (worker <- 0 until threadCount) {
        System.arraycopy(perThread(worker), 0, merged, offset, completed(worker))
        offset += completed(worker)
      }

      summarize(s"$name (${threadCount} threads)", merged, totalNanos, memoryAfter - memoryBefore)
    } finally {
      // Always release the worker threads, even if the benchmark is interrupted.
      executor.shutdown()
    }
  }

  /**
   * Turns raw latency samples into a [[BenchmarkResult]]. Sorts `samples` in place.
   * An empty sample set yields zero latencies instead of failing.
   */
  private def summarize(
    name: String,
    samples: Array[Long],
    totalNanos: Long,
    memoryDelta: Long
  ): BenchmarkResult = {
    java.util.Arrays.sort(samples)
    val count = samples.length

    def nanos(value: Long): Duration = Duration(value, TimeUnit.NANOSECONDS)

    def percentile(fraction: Double): Duration =
      if (count == 0) Duration.Zero
      else nanos(samples(math.min(count - 1, (count * fraction).toInt)))

    val average: Duration = if (count == 0) Duration.Zero else nanos(samples.sum / count)
    val throughput = if (totalNanos > 0) count.toDouble / (totalNanos.toDouble / 1e9) else 0.0

    BenchmarkResult(
      operation = name,
      totalTime = nanos(totalNanos),
      iterations = count,
      throughput = throughput,
      averageLatency = average,
      p95Latency = percentile(0.95),
      p99Latency = percentile(0.99),
      memoryUsed = memoryDelta
    )
  }

  /** Heap currently in use according to `Runtime`: total minus free, in bytes. */
  private def getUsedMemory(): Long = {
    val runtime = Runtime.getRuntime
    runtime.totalMemory() - runtime.freeMemory()
  }

  /**
   * Runs `operation` and estimates how much heap it retained.
   *
   * The estimate is the difference in used heap between two GC requests, floored at zero, so it
   * reflects memory still reachable afterwards rather than the total bytes allocated.
   *
   * @return the operation's result and the retained-heap estimate in bytes
   */
  def measureAllocations[T](operation: () => T): (T, Long) = {
    System.gc()
    Thread.sleep(50)
    val baselineMemory = getUsedMemory()

    val result = operation()

    System.gc()
    Thread.sleep(50)
    val finalMemory = getUsedMemory()

    (result, math.max(0L, finalMemory - baselineMemory))
  }

  /**
   * Runs `operation` on the calling thread and reports the fraction of wall-clock time that
   * thread spent on the CPU (thread CPU time / elapsed time).
   *
   * Returns a utilisation of 0.0 when the JVM reports thread CPU time as unavailable (negative)
   * or when no measurable wall-clock time elapsed.
   *
   * @return the operation's result and the CPU utilisation ratio
   */
  def profileCpuUsage[T](operation: () => T): (T, Double) = {
    val threads = ManagementFactory.getThreadMXBean
    val self = Thread.currentThread().getId

    val cpuStart = threads.getThreadCpuTime(self)
    val wallStart = System.nanoTime()

    val result = operation()

    val cpuNanos = threads.getThreadCpuTime(self) - cpuStart
    val wallNanos = System.nanoTime() - wallStart

    val cpuUtilization =
      if (cpuStart < 0 || cpuNanos < 0 || wallNanos <= 0) 0.0
      else cpuNanos.toDouble / wallNanos.toDouble

    (result, cpuUtilization)
  }
}

// JVM and GC monitoring
/**
 * Snapshots of JVM memory and garbage-collector counters read from the platform MXBeans.
 */
object JVMMetrics {

  // Brought into scope for the whole object: both the memory-pool and GC queries need `asScala`.
  import scala.jdk.CollectionConverters._
  import java.lang.management.BufferPoolMXBean

  /**
   * Counters of one garbage collector.
   *
   * @param gcName            collector name as reported by its MXBean
   * @param collectionCount   number of collections (cumulative, or a delta in [[monitorGC]])
   * @param collectionTime    collection time in milliseconds (cumulative, or a delta in [[monitorGC]])
   * @param memoryPoolsBefore not populated by this object (always empty)
   * @param memoryPoolsAfter  not populated by this object (always empty)
   */
  case class GCStats(
    gcName: String,
    collectionCount: Long,
    collectionTime: Long,
    memoryPoolsBefore: Map[String, Long],
    memoryPoolsAfter: Map[String, Long]
  )

  /**
   * Point-in-time memory usage, all values in bytes.
   *
   * @param directMemory bytes held by the JVM's "direct" buffer pool, 0 if that pool is not reported
   * @param pools        per memory pool: (used, max)
   */
  case class MemoryStats(
    heapUsed: Long,
    heapMax: Long,
    nonHeapUsed: Long,
    nonHeapMax: Long,
    directMemory: Long,
    pools: Map[String, (Long, Long)] // (used, max)
  )

  /** Reads heap, non-heap, per-pool and direct-buffer usage from the platform MXBeans. */
  def getCurrentMemoryStats(): MemoryStats = {
    val memoryBean = ManagementFactory.getMemoryMXBean
    val heapMemory = memoryBean.getHeapMemoryUsage
    val nonHeapMemory = memoryBean.getNonHeapMemoryUsage

    // `getUsage` returns null for a pool that is no longer valid; such pools are skipped.
    val pools: Map[String, (Long, Long)] =
      ManagementFactory.getMemoryPoolMXBeans.asScala.iterator.flatMap { pool =>
        Option(pool.getUsage).map(usage => pool.getName -> ((usage.getUsed, usage.getMax)))
      }.toMap

    // Direct (off-heap NIO) buffers are tracked by the buffer pool named "direct".
    val directMemory =
      ManagementFactory.getPlatformMXBeans(classOf[BufferPoolMXBean]).asScala
        .find(_.getName == "direct")
        .map(pool => math.max(0L, pool.getMemoryUsed))
        .getOrElse(0L)

    MemoryStats(
      heapUsed = heapMemory.getUsed,
      heapMax = heapMemory.getMax,
      nonHeapUsed = nonHeapMemory.getUsed,
      nonHeapMax = nonHeapMemory.getMax,
      directMemory = directMemory,
      pools = pools
    )
  }

  /** Cumulative collection count and time for every garbage collector of this JVM. */
  def getGCStats(): List[GCStats] =
    ManagementFactory.getGarbageCollectorMXBeans.asScala.iterator.map { gc =>
      GCStats(
        gcName = gc.getName,
        collectionCount = gc.getCollectionCount,
        collectionTime = gc.getCollectionTime,
        memoryPoolsBefore = Map.empty, // Would need more complex tracking
        memoryPoolsAfter = Map.empty
      )
    }.toList

  /**
   * Runs `operation` and reports the GC activity that happened while it ran.
   *
   * @return the operation's result, per-collector deltas of count and time, and the elapsed
   *         wall-clock time
   */
  def monitorGC[T](operation: () => T): (T, List[GCStats], Duration) = {
    val before = getGCStats().map(stats => stats.gcName -> stats).toMap
    val startTime = System.nanoTime()

    val result = operation()

    val endTime = System.nanoTime()
    val after = getGCStats()

    // Pair readings by collector name rather than by list position; a collector without an
    // earlier reading is reported with its cumulative counters.
    val gcDelta = after.map { current =>
      before.get(current.gcName).fold(current) { earlier =>
        current.copy(
          collectionCount = current.collectionCount - earlier.collectionCount,
          collectionTime = current.collectionTime - earlier.collectionTime
        )
      }
    }

    (result, gcDelta, Duration(endTime - startTime, TimeUnit.NANOSECONDS))
  }
}

Algorithm and Data Structure Optimization

// Optimized collection operations
/**
 * Collection, set and string helpers that trade a little generality for fewer allocations.
 * None of the mutable helpers in here are synchronised.
 */
object OptimizedCollections {

  import scala.collection.mutable

  /**
   * Groups `items` by `keyFunc` in a single pass.
   *
   * @return a map from key to the items with that key, in their original relative order
   */
  def efficientGroupBy[T, K](items: List[T], keyFunc: T => K): Map[K, List[T]] = {
    val buckets = mutable.Map.empty[K, mutable.ListBuffer[T]]
    for (item <- items) {
      buckets.getOrElseUpdate(keyFunc(item), mutable.ListBuffer.empty[T]) += item
    }
    buckets.iterator.map { case (key, bucket) => key -> bucket.toList }.toMap
  }

  /**
   * Maps and filters in one pass: keeps the values for which `f` returns `Some`.
   */
  def filterMap[T, U](items: List[T])(f: T => Option[U]): List[U] =
    items.flatMap(item => f(item))

  /**
   * Lazily processes `items` in chunks of at most `chunkSize` elements and concatenates the
   * processor's outputs. Only one chunk is materialised at a time.
   */
  def processInChunks[T, U](
    items: Iterator[T],
    chunkSize: Int,
    processor: List[T] => List[U]
  ): Iterator[U] =
    items.grouped(chunkSize).flatMap(chunk => processor(chunk.toList))

  /**
   * Caches the results of `f` per argument. The cache is unbounded and not thread-safe.
   */
  class MemoizedFunction[A, B](f: A => B) {
    private val cache = mutable.Map.empty[A, B]

    /** Returns the cached value for `x`, computing and storing it on first use. */
    def apply(x: A): B = cache.getOrElseUpdate(x, f(x))

    def clearCache(): Unit = cache.clear()
    def cacheSize: Int = cache.size
  }

  // Efficient set operations
  object FastSetOps {

    /**
     * Fixed-size bit set for the integers `0 to maxValue` (inclusive).
     *
     * @param maxValue largest value that can be stored; a negative value creates an empty,
     *                 unusable set
     */
    class BitSetInt(maxValue: Int) {
      // One word per 64 values. `maxValue / 64 + 1` words are needed so that `maxValue` itself
      // fits (rounding `maxValue` up to a multiple of 64 is one word short when it already is one).
      private val bits = new Array[Long](if (maxValue < 0) 0 else maxValue / 64 + 1)

      def add(value: Int): Unit =
        bits(value / 64) |= (1L << (value % 64))

      def contains(value: Int): Boolean =
        (bits(value / 64) & (1L << (value % 64))) != 0

      def remove(value: Int): Unit =
        bits(value / 64) &= ~(1L << (value % 64))

      /** Number of values currently in the set. */
      def size: Int = {
        var total = 0
        var word = 0
        while (word < bits.length) {
          total += java.lang.Long.bitCount(bits(word))
          word += 1
        }
        total
      }
    }

    /**
     * Bloom filter for approximate membership tests: `mightContain` never returns false for an
     * added item but may return true for an item that was never added.
     *
     * The probe positions are derived from a single hash with a fixed stride, so the real
     * false-positive rate depends on the quality of `hash` and can exceed `errorRate`.
     *
     * @param capacity  expected number of items
     * @param errorRate target false-positive probability, strictly between 0 and 1
     * @param hash      hash function for the items
     * @throws IllegalArgumentException if `errorRate` is outside (0, 1)
     */
    class BloomFilter[T](capacity: Int, errorRate: Double = 0.01)(hash: T => Int) {
      require(errorRate > 0.0 && errorRate < 1.0, s"errorRate must be in (0, 1), got $errorRate")

      // Standard sizing formulas, floored at 1 so that tiny capacities still give a usable filter.
      private val bitArraySize =
        math.max(1, (-capacity.toDouble * math.log(errorRate) / (math.log(2) * math.log(2))).toInt)
      private val hashFunctions = math.max(1, (-math.log(errorRate) / math.log(2)).toInt)
      // Packed bits instead of one byte per flag.
      private val bitArray = new java.util.BitSet(bitArraySize)

      private def position(baseHash: Int, probe: Int): Int =
        math.abs((baseHash + probe * 37) % bitArraySize)

      def add(item: T): Unit = {
        val baseHash = hash(item)
        var probe = 0
        while (probe < hashFunctions) {
          bitArray.set(position(baseHash, probe))
          probe += 1
        }
      }

      def mightContain(item: T): Boolean = {
        val baseHash = hash(item)
        (0 until hashFunctions).forall(probe => bitArray.get(position(baseHash, probe)))
      }
    }
  }

  // High-performance string operations
  object StringOptimizations {

    /** A `StringBuilder` pre-sized for `expectedLength` characters. */
    def efficientStringBuilder(expectedLength: Int): StringBuilder =
      new StringBuilder(expectedLength)

    // String interning for repeated strings (unbounded, not thread-safe)
    private val internMap = mutable.Map.empty[String, String]

    /** Returns the first instance seen that is equal to `s`, remembering `s` if it is new. */
    def intern(s: String): String = internMap.getOrElseUpdate(s, s)

    /**
     * Finds every occurrence of `pattern` in `text` with the Knuth-Morris-Pratt algorithm.
     *
     * @return start indices of all (possibly overlapping) matches in ascending order;
     *         empty when `pattern` is empty
     */
    def kmpSearch(text: String, pattern: String): List[Int] =
      if (pattern.isEmpty) List.empty
      else {
        val patternLength = pattern.length

        // failure(i) = length of the longest proper prefix of pattern[0..i] that is also its suffix
        val failure = new Array[Int](patternLength)
        var prefix = 0
        var i = 1
        while (i < patternLength) {
          while (prefix > 0 && pattern.charAt(i) != pattern.charAt(prefix)) {
            prefix = failure(prefix - 1)
          }
          if (pattern.charAt(i) == pattern.charAt(prefix)) prefix += 1
          failure(i) = prefix
          i += 1
        }

        val matches = mutable.ListBuffer.empty[Int]
        var matched = 0
        var pos = 0
        while (pos < text.length) {
          while (matched > 0 && text.charAt(pos) != pattern.charAt(matched)) {
            matched = failure(matched - 1)
          }
          if (text.charAt(pos) == pattern.charAt(matched)) matched += 1
          if (matched == patternLength) {
            matches += (pos - patternLength + 1)
            // Fall back instead of restarting so that overlapping matches are found too.
            matched = failure(patternLength - 1)
          }
          pos += 1
        }

        matches.toList
      }
  }
}

// Numerical computation optimizations
/**
 * Integer math shortcuts and primitive-backed numeric containers.
 */
object NumericalOptimizations {

  // Fast mathematical operations
  object FastMath {

    /**
     * Integer square root: the largest `r` with `r * r <= x`, computed with Newton's method.
     *
     * @throws IllegalArgumentException if `x` is negative
     */
    def fastSqrt(x: Int): Int = {
      require(x >= 0, s"fastSqrt requires a non-negative argument, got $x")
      if (x < 2) x
      else {
        // Long arithmetic: `x + 1` must not overflow for values near Int.MaxValue.
        val n = x.toLong
        var estimate = n
        var next = (n + 1) / 2
        while (next < estimate) {
          estimate = next
          next = (estimate + n / estimate) / 2
        }
        estimate.toInt
      }
    }

    /**
     * `base` raised to `exponent` by repeated squaring (O(log exponent) multiplications).
     * Overflow wraps silently, as with ordinary `Long` multiplication.
     *
     * @throws IllegalArgumentException if `exponent` is negative
     */
    def fastPow(base: Long, exponent: Int): Long = {
      require(exponent >= 0, s"fastPow requires a non-negative exponent, got $exponent")
      var result = 1L
      var square = base
      var remaining = exponent
      while (remaining > 0) {
        if ((remaining & 1) == 1) result *= square
        square *= square
        remaining >>= 1
      }
      result
    }

    // Bit manipulation utilities

    /** True when `n` is positive and has exactly one bit set. */
    def isPowerOfTwo(n: Int): Boolean = n > 0 && (n & (n - 1)) == 0

    /**
     * Smallest power of two that is `>= n`; 1 for `n <= 1`.
     *
     * @throws IllegalArgumentException if the result would not fit in an `Int` (`n > 2^30`)
     */
    def nextPowerOfTwo(n: Int): Int =
      if (n <= 1) 1
      else {
        require(n <= (1 << 30), s"no power of two >= $n fits in an Int")
        Integer.highestOneBit(n - 1) << 1
      }

    /** Number of one-bits in the two's-complement representation of `n`. */
    def countSetBits(n: Int): Int = java.lang.Integer.bitCount(n)

    /**
     * Remainder of `dividend / divisor`, always equal to `dividend % divisor`.
     *
     * The bit-mask shortcut is only taken for non-negative dividends, because masking a negative
     * dividend gives a different (non-negative) result than `%`.
     *
     * @throws ArithmeticException if `divisor` is zero
     */
    def fastMod(dividend: Int, divisor: Int): Int =
      if (dividend >= 0 && isPowerOfTwo(divisor)) dividend & (divisor - 1)
      else dividend % divisor
  }

  /**
   * Fixed-capacity, append-only buffer of unboxed `Int`s.
   *
   * The constructor parameter is named `capacity` so that it does not clash with the `size`
   * accessor, which reports the number of elements added so far.
   *
   * @param capacity maximum number of elements
   */
  class IntArray(capacity: Int) {
    private val array = new Array[Int](capacity)
    private var length = 0

    /**
     * Appends `value`.
     *
     * @throws IndexOutOfBoundsException if the buffer is already full
     */
    def add(value: Int): Unit = {
      if (length >= array.length) {
        throw new IndexOutOfBoundsException("Array is full")
      }
      array(length) = value
      length += 1
    }

    /**
     * Element at `index`.
     *
     * @throws IndexOutOfBoundsException if `index` is negative or not below `size`
     */
    def get(index: Int): Int = {
      if (index < 0 || index >= length) {
        throw new IndexOutOfBoundsException(s"index $index out of bounds for length $length")
      }
      array(index)
    }

    /** Number of elements added so far. */
    def size: Int = length

    /** Sum of the stored elements, accumulated as `Long` to avoid `Int` overflow. */
    def sum: Long = {
      var total = 0L
      var i = 0
      while (i < length) {
        total += array(i)
        i += 1
      }
      total
    }

    /** Largest stored element, or `None` when empty. */
    def max: Option[Int] =
      if (length == 0) None
      else {
        var maximum = array(0)
        var i = 1
        while (i < length) {
          if (array(i) > maximum) maximum = array(i)
          i += 1
        }
        Some(maximum)
      }

    /** Sorts the stored elements ascending, in place. */
    def sortInPlace(): Unit =
      java.util.Arrays.sort(array, 0, length)
  }

  /**
   * Dense row-major matrix of doubles stored in one flat array.
   *
   * `rows` and `cols` are public so that operations such as [[multiply]] can read the other
   * operand's dimensions. Element access does not bounds-check the column: a column index
   * `>= cols` addresses the following row.
   *
   * @throws IllegalArgumentException if a dimension is negative or `rows * cols` exceeds
   *                                  the maximum array size
   */
  class DenseMatrix(val rows: Int, val cols: Int) {
    require(
      rows >= 0 && cols >= 0 && rows.toLong * cols <= Int.MaxValue,
      s"invalid matrix dimensions ${rows}x$cols"
    )

    private val data = new Array[Double](rows * cols)

    def apply(row: Int, col: Int): Double =
      data(row * cols + col)

    def update(row: Int, col: Int, value: Double): Unit =
      data(row * cols + col) = value

    /**
     * Matrix product `this * other`.
     *
     * @throws IllegalArgumentException if `this.cols != other.rows`
     */
    def multiply(other: DenseMatrix): DenseMatrix = {
      require(cols == other.rows, "Matrix dimensions don't match")

      val width = other.cols
      val result = new DenseMatrix(rows, width)
      val out = result.data
      val rhs = other.data

      // i-k-j order: the innermost loop walks one row of `other` and one row of the result
      // sequentially, which is friendlier to the cache than the textbook i-j-k order.
      var i = 0
      while (i < rows) {
        val outRow = i * width
        var k = 0
        while (k < cols) {
          val left = data(i * cols + k)
          val rhsRow = k * width
          var j = 0
          while (j < width) {
            out(outRow + j) += left * rhs(rhsRow + j)
            j += 1
          }
          k += 1
        }
        i += 1
      }

      result
    }

    /** A new matrix with rows and columns swapped. */
    def transpose: DenseMatrix = {
      val result = new DenseMatrix(cols, rows)
      var i = 0
      while (i < rows) {
        var j = 0
        while (j < cols) {
          result(j, i) = this(i, j)
          j += 1
        }
        i += 1
      }
      result
    }
  }
}

Memory Management and GC Optimization

Understanding Memory Allocation Patterns

// Memory-efficient programming patterns
/**
 * Patterns that reduce heap pressure: object pooling, off-heap and memory-mapped storage,
 * flyweights and reference-based caches.
 */
object MemoryOptimizations {

  /**
   * Thread-safe pool of reusable objects.
   *
   * @param factory creates a new object when the pool is empty
   * @param reset   restores a returned object to a reusable state
   * @param maxSize maximum number of idle objects kept; further returns are dropped
   */
  class ObjectPool[T](factory: () => T, reset: T => Unit, maxSize: Int = 100) {
    private val pool = scala.collection.mutable.Queue[T]()
    private var created = 0

    /** Takes an idle object from the pool, or creates one if none is idle. */
    def borrow(): T = pool.synchronized {
      if (pool.isEmpty) {
        created += 1
        factory()
      } else {
        pool.dequeue()
      }
    }

    /** Gives `obj` back; it is reset and kept only while fewer than `maxSize` objects are idle. */
    def return_(obj: T): Unit = pool.synchronized {
      if (pool.size < maxSize) {
        reset(obj)
        pool.enqueue(obj)
      }
    }

    /** (objects created so far, objects currently idle). */
    def stats: (Int, Int) = pool.synchronized((created, pool.size))
  }

  // StringBuilder pool for string operations
  val stringBuilderPool = new ObjectPool[StringBuilder](
    factory = () => new StringBuilder(256),
    reset = _.clear(),
    maxSize = 50
  )

  /**
   * Runs `f` with a pooled `StringBuilder` and returns the builder to the pool afterwards,
   * even if `f` throws. `f` must not keep a reference to the builder.
   */
  def withStringBuilder[T](f: StringBuilder => T): T = {
    val builder = stringBuilderPool.borrow()
    try f(builder)
    finally stringBuilderPool.return_(builder)
  }

  /**
   * Fixed-size `Int` array stored in a direct (off-heap) `ByteBuffer`.
   *
   * The constructor parameter is named `capacity` so that it does not clash with the `size`
   * accessor.
   *
   * @param capacity number of `Int` slots
   * @throws ArithmeticException if `capacity * 4` bytes does not fit in an `Int`
   */
  class OffHeapIntArray(capacity: Int) {
    private val buffer = java.nio.ByteBuffer.allocateDirect(Math.multiplyExact(capacity, 4))

    // Byte offset of slot `index`, rejecting indices outside the array.
    private def offset(index: Int): Int = {
      if (index < 0 || index >= size) {
        throw new IndexOutOfBoundsException(s"index $index out of bounds for size $size")
      }
      index * 4
    }

    def set(index: Int, value: Int): Unit =
      buffer.putInt(offset(index), value)

    def get(index: Int): Int =
      buffer.getInt(offset(index))

    /** Number of `Int` slots. */
    def size: Int = buffer.capacity() / 4

    /**
     * Best-effort early release of the native memory.
     *
     * This reflectively looks for a `cleaner()` on the buffer's implementation class. That class
     * is JDK-internal, so the reflective call is frequently denied; in that case this method does
     * nothing and the memory is released when the buffer itself is garbage-collected.
     */
    def close(): Unit = {
      try {
        val cleanerMethod = buffer.getClass.getMethod("cleaner")
        val cleaner = cleanerMethod.invoke(buffer)
        val cleanMethod = cleaner.getClass.getMethod("clean")
        cleanMethod.invoke(cleaner)
      } catch {
        case _: Exception => // Ignore cleanup errors
      }
    }
  }

  /**
   * Array of `Long`s backed by a read/write memory-mapped file (created if missing).
   *
   * @param file file to map
   * @param size number of bytes to map; a single mapping cannot exceed `Int.MaxValue` bytes
   */
  class MemoryMappedArray(file: java.io.File, size: Long) {
    private val channel = java.nio.channels.FileChannel.open(
      file.toPath,
      java.nio.file.StandardOpenOption.CREATE,
      java.nio.file.StandardOpenOption.READ,
      java.nio.file.StandardOpenOption.WRITE
    )

    private val buffer = {
      var mapped = false
      try {
        val mapping = channel.map(java.nio.channels.FileChannel.MapMode.READ_WRITE, 0, size)
        mapped = true
        mapping
      } finally {
        // Do not leak the file handle when the mapping cannot be created.
        if (!mapped) channel.close()
      }
    }

    // Byte offset of element `index`. Rejects indices whose offset would not fit in an Int
    // instead of letting the conversion wrap around to some other, valid-looking position.
    private def offset(index: Long): Int = {
      if (index < 0L || index > Int.MaxValue / 8L) {
        throw new IndexOutOfBoundsException(s"index $index is outside the mapped range")
      }
      (index * 8L).toInt
    }

    def putLong(index: Long, value: Long): Unit =
      buffer.putLong(offset(index), value)

    def getLong(index: Long): Long =
      buffer.getLong(offset(index))

    /** Closes the file channel. */
    def close(): Unit =
      channel.close()
  }

  /**
   * Flyweight cache: hands out one shared instance per key. Unbounded and not thread-safe.
   */
  class FlyweightFactory[T] {
    private val cache = scala.collection.mutable.Map[String, T]()

    /** Returns the instance cached under `key`, evaluating `factory` only on first use. */
    def getInstance(key: String, factory: => T): T =
      cache.getOrElseUpdate(key, factory)

    def cacheSize: Int = cache.size
    def clearCache(): Unit = cache.clear()
  }

  // Reference management utilities
  object ReferenceUtils {
    import java.lang.ref.{SoftReference, WeakReference}

    /**
     * Cache whose values are held through weak references, so an entry's value can be collected
     * as soon as nothing else references it. Not thread-safe.
     */
    class WeakCache[K, V] {
      private val cache = scala.collection.mutable.Map[K, WeakReference[V]]()

      /** The cached value, or `None` if absent or already collected. */
      def get(key: K): Option[V] =
        cache.get(key).flatMap(ref => Option(ref.get()))

      def put(key: K, value: V): Unit =
        cache(key) = new WeakReference(value)

      /** Drops entries whose value has been collected. */
      def cleanup(): Unit =
        cache.filterInPlace((_, ref) => ref.get() != null)

      /** Number of entries, including ones whose value has been collected but not cleaned up. */
      def size: Int = cache.size
    }

    /**
     * Cache whose values are held through soft references, which the JVM clears when memory
     * runs low. Not thread-safe.
     */
    class SoftCache[K, V] {
      private val cache = scala.collection.mutable.Map[K, SoftReference[V]]()

      /** The cached value, or `None` if absent or already cleared. */
      def get(key: K): Option[V] =
        cache.get(key).flatMap(ref => Option(ref.get()))

      def put(key: K, value: V): Unit =
        cache(key) = new SoftReference(value)

      /** Drops entries whose value has been cleared. */
      def cleanup(): Unit =
        cache.filterInPlace((_, ref) => ref.get() != null)
    }
  }

  /**
   * Evaluates `operation` and reports the change in used heap between a GC request before and
   * one after it.
   *
   * @return the operation's result and the heap delta in bytes (may be negative)
   */
  def measureMemoryUsage[T](operation: => T): (T, Long) = {
    System.gc()
    Thread.sleep(100)
    val memoryBefore = getUsedMemory()

    val result = operation

    System.gc()
    Thread.sleep(100)
    val memoryAfter = getUsedMemory()

    (result, memoryAfter - memoryBefore)
  }

  /** Heap currently in use according to `Runtime`: total minus free, in bytes. */
  private def getUsedMemory(): Long = {
    val runtime = Runtime.getRuntime
    runtime.totalMemory() - runtime.freeMemory()
  }
}

// Garbage collection optimization strategies
/**
 * Helpers for observing heap growth, GC-friendly coding patterns and canned JVM flag sets.
 */
object GCOptimization {

  /**
   * Samples the growth of used heap between successive calls.
   *
   * The underlying figure is "heap bytes currently in use" summed over all heap pools, not a
   * true allocation counter, so the reported rate is negative when a collection freed more
   * than was allocated since the previous sample.
   */
  class AllocationRateMonitor {
    // Both fields form one sample and are only touched under the instance lock.
    private var lastSampleTime = System.currentTimeMillis()
    private var lastUsedBytes = getAllocatedBytes()

    /**
     * Change in used heap since the previous call (or since construction), in bytes per
     * millisecond. Returns 0.0 when less than one millisecond has elapsed.
     */
    def getCurrentAllocationRate(): Double = synchronized {
      val currentTime = System.currentTimeMillis()
      val currentUsed = getAllocatedBytes()

      val timeDelta = currentTime - lastSampleTime
      val bytesDelta = currentUsed - lastUsedBytes

      lastSampleTime = currentTime
      lastUsedBytes = currentUsed

      if (timeDelta > 0) bytesDelta.toDouble / timeDelta else 0.0
    }

    // Bytes currently used across all heap memory pools.
    private def getAllocatedBytes(): Long = {
      import scala.jdk.CollectionConverters._

      ManagementFactory.getMemoryPoolMXBeans.asScala.iterator
        .filter(_.getType == java.lang.management.MemoryType.HEAP)
        .flatMap(pool => Option(pool.getUsage)) // a pool that is no longer valid reports null
        .map(_.getUsed)
        .sum
    }
  }

  // GC-friendly programming patterns
  object GCFriendlyPatterns {

    /** Applies `processor` to every element in a single pass, building one result list. */
    def processDataGCFriendly[T](data: List[T], processor: T => T): List[T] =
      data.map(processor)

    /** Sums an `Int` array into a `Long` with a plain loop, without boxing. */
    def sumIntsEfficiently(numbers: Array[Int]): Long = {
      var sum = 0L
      var i = 0
      while (i < numbers.length) {
        sum += numbers(i)
        i += 1
      }
      sum
    }

    // Use primitive collections when possible
    import scala.collection.mutable.ArrayBuffer

    /**
     * Builds a buffer holding `0, 2, 4, ...` (`size` elements), pre-sized to avoid regrowth.
     *
     * Note that `ArrayBuffer[Int]` stores boxed values; use an `Array[Int]` where boxing matters.
     */
    def efficientIntProcessing(size: Int): ArrayBuffer[Int] = {
      val buffer = new ArrayBuffer[Int](size)
      var i = 0
      while (i < size) {
        buffer += i * 2
        i += 1
      }
      buffer
    }

    /**
     * Streams `source` through `processor` in batches of at most `batchSize` elements, so that
     * only one batch is held in memory at a time.
     */
    def processLargeDataset[T, U](
      source: Iterator[T],
      batchSize: Int,
      processor: List[T] => List[U]
    ): Iterator[U] =
      source.grouped(batchSize).flatMap(batch => processor(batch.toList))
  }

  // JVM tuning recommendations
  object JVMTuning {

    /**
     * A suggested set of JVM options.
     *
     * @param heapSize        `-Xmx` option
     * @param gcAlgorithm     option selecting the collector
     * @param additionalFlags further `-XX` options for that collector
     * @param reasoning       human-readable rationale
     */
    case class GCTuningRecommendations(
      heapSize: String,
      gcAlgorithm: String,
      additionalFlags: List[String],
      reasoning: String
    )

    /**
     * Picks a canned option set for an application profile.
     *
     * @param applicationProfile  "low-latency", "high-throughput" or "balanced" (case-insensitive);
     *                            anything else gets the general-purpose G1 settings
     * @param expectedHeapUsage   expected heap usage; rendered with an `m` suffix, i.e. treated
     *                            as megabytes
     * @param latencyRequirements currently not used by the selection
     */
    def recommendGCSettings(
      applicationProfile: String,
      expectedHeapUsage: Long,
      latencyRequirements: Duration
    ): GCTuningRecommendations = {

      def maxHeap(megabytes: Long): String = s"-Xmx${megabytes}m"

      // Locale.ROOT keeps the match independent of the default locale's case-mapping rules.
      applicationProfile.toLowerCase(java.util.Locale.ROOT) match {
        case "low-latency" =>
          GCTuningRecommendations(
            heapSize = maxHeap(expectedHeapUsage * 2),
            gcAlgorithm = "-XX:+UseZGC",
            additionalFlags = List(
              // On JVMs where ZGC is still experimental this option has to be passed
              // ahead of the collector option.
              "-XX:+UnlockExperimentalVMOptions",
              "-XX:+UseTransparentHugePages",
              // ZGC's option for returning unused heap to the OS.
              "-XX:+ZUncommit"
            ),
            reasoning = "ZGC provides consistent low latency with minimal pause times"
          )

        case "high-throughput" =>
          GCTuningRecommendations(
            heapSize = maxHeap(expectedHeapUsage),
            gcAlgorithm = "-XX:+UseParallelGC",
            additionalFlags = List(
              "-XX:+UseAdaptiveSizePolicy",
              "-XX:MaxGCPauseMillis=200",
              "-XX:GCTimeRatio=19"
            ),
            reasoning = "Parallel GC optimizes for maximum throughput"
          )

        case "balanced" =>
          GCTuningRecommendations(
            heapSize = maxHeap(expectedHeapUsage),
            gcAlgorithm = "-XX:+UseG1GC",
            additionalFlags = List(
              "-XX:MaxGCPauseMillis=100",
              "-XX:G1HeapRegionSize=16m",
              "-XX:+G1UseAdaptiveIHOP"
            ),
            reasoning = "G1GC provides good balance between latency and throughput"
          )

        case _ =>
          GCTuningRecommendations(
            heapSize = maxHeap(expectedHeapUsage),
            gcAlgorithm = "-XX:+UseG1GC",
            additionalFlags = List("-XX:MaxGCPauseMillis=100"),
            reasoning = "Default G1GC settings for general purpose applications"
          )
      }
    }
  }
}

Profiling Tools and Techniques

Application Performance Monitoring

import java.lang.management.ManagementFactory
import javax.management.ObjectName
import scala.jdk.CollectionConverters._

// Comprehensive application profiler
/**
 * In-process profiler that records method timings, GC activity and thread snapshots for one
 * session at a time and turns them into a [[PerformanceReport]].
 *
 * Measurements may be recorded concurrently by the caller's threads and by the background
 * monitoring thread; all access to the shared buffers goes through a single lock.
 */
class ApplicationProfiler {

  /**
   * One profiling session.
   *
   * @param sessionId    caller-chosen identifier
   * @param startTime    start timestamp, epoch milliseconds
   * @param endTime      end timestamp, epoch milliseconds; `None` while running
   * @param measurements measurements captured during the session
   */
  case class ProfilerSession(
    sessionId: String,
    startTime: Long,
    endTime: Option[Long] = None,
    measurements: List[Measurement] = List.empty
  )

  /** A single recorded data point. */
  sealed trait Measurement {
    /** Capture time, epoch milliseconds. */
    def timestamp: Long
    /** Kind of measurement: "method", "gc" or "threads". */
    def category: String
  }

  /** One profiled call: `duration` in nanoseconds, `memoryAllocated` as used-heap delta in bytes. */
  case class MethodInvocation(
    timestamp: Long,
    className: String,
    methodName: String,
    duration: Long,
    memoryAllocated: Long
  ) extends Measurement {
    val category = "method"
  }

  /**
   * GC activity of one collector between two monitoring samples. `duration` is the collection
   * time in milliseconds accumulated since the previous sample and may cover several collections.
   */
  case class GCEvent(
    timestamp: Long,
    gcName: String,
    duration: Long,
    memoryBefore: Long,
    memoryAfter: Long
  ) extends Measurement {
    val category = "gc"
  }

  /** Thread-state counts and process CPU load at one point in time. */
  case class ThreadSnapshot(
    timestamp: Long,
    threadCount: Int,
    blockedThreads: Int,
    waitingThreads: Int,
    cpuUsage: Double
  ) extends Measurement {
    val category = "threads"
  }

  // Guards `measurements` and `gcBaseline`, which are shared with the monitoring thread.
  private val lock = new Object
  // Volatile so the monitoring thread observes the end of a session.
  @volatile private var currentSession: Option[ProfilerSession] = None
  private val measurements = scala.collection.mutable.ListBuffer[Measurement]()
  // Last seen (collection count, collection time) per collector, used to compute deltas.
  private val gcBaseline = scala.collection.mutable.Map[String, (Long, Long)]()

  private def record(measurement: Measurement): Unit =
    lock.synchronized { measurements += measurement }

  /** Starts a new session, discarding measurements of any previous one. */
  def startSession(sessionId: String): Unit = {
    lock.synchronized {
      measurements.clear()
      // Collections that happened before the session must not be attributed to it.
      gcBaseline.clear()
      ManagementFactory.getGarbageCollectorMXBeans.asScala.foreach { gc =>
        gcBaseline(gc.getName) = (gc.getCollectionCount, gc.getCollectionTime)
      }
    }
    currentSession = Some(ProfilerSession(sessionId, System.currentTimeMillis()))
  }

  /**
   * Ends the running session.
   *
   * @return the session with its end time and captured measurements, or `None` if no session
   *         was running
   */
  def endSession(): Option[ProfilerSession] = {
    val ended = currentSession.map { session =>
      session.copy(
        endTime = Some(System.currentTimeMillis()),
        measurements = lock.synchronized(measurements.toList)
      )
    }
    currentSession = None
    ended
  }

  /**
   * Times `operation` and records a [[MethodInvocation]] under the given names.
   * Nothing is recorded if `operation` throws.
   *
   * @return the operation's result
   */
  def profileMethod[T](className: String, methodName: String)(operation: => T): T = {
    val startTime = System.nanoTime()
    val memoryBefore = getUsedMemory()

    val result = operation

    val elapsed = System.nanoTime() - startTime
    val memoryDelta = getUsedMemory() - memoryBefore

    record(
      MethodInvocation(
        timestamp = System.currentTimeMillis(),
        className = className,
        methodName = methodName,
        duration = elapsed,
        memoryAllocated = memoryDelta
      )
    )
    result
  }

  /**
   * Starts a daemon thread that samples GC and thread state every `intervalMs` milliseconds
   * until the session that is running at the time of this call ends. Does nothing useful if
   * no session is running.
   */
  def startContinuousMonitoring(intervalMs: Long = 1000): Unit = {
    val monitored = currentSession
    val monitoring = new Thread(() => {
      try {
        // Tied to this particular session so the thread also stops when a new session replaces it.
        while (monitored.isDefined && (currentSession eq monitored)) {
          captureSystemSnapshot()
          Thread.sleep(intervalMs)
        }
      } catch {
        case _: InterruptedException => Thread.currentThread().interrupt()
      }
    })
    monitoring.setDaemon(true)
    monitoring.start()
  }

  private def captureSystemSnapshot(): Unit = {
    captureGCEvents()
    captureThreadSnapshot()
    captureMemorySnapshot()
  }

  // Records, per collector, the GC time accumulated since the previous sample; collectors
  // without new collections produce no event.
  private def captureGCEvents(): Unit = {
    val now = System.currentTimeMillis()
    ManagementFactory.getGarbageCollectorMXBeans.asScala.foreach { gc =>
      val name = gc.getName
      val collectionCount = gc.getCollectionCount
      val collectionTime = gc.getCollectionTime

      lock.synchronized {
        val (previousCount, previousTime) = gcBaseline.getOrElse(name, (0L, 0L))
        gcBaseline(name) = (collectionCount, collectionTime)

        if (collectionCount > previousCount) {
          measurements += GCEvent(
            timestamp = now,
            gcName = name,
            duration = math.max(0L, collectionTime - previousTime),
            memoryBefore = 0, // Would track actual values
            memoryAfter = 0
          )
        }
      }
    }
  }

  private def captureThreadSnapshot(): Unit = {
    val threadInfo = ManagementFactory.getThreadMXBean.dumpAllThreads(false, false)

    val blockedCount = threadInfo.count(_.getThreadState == Thread.State.BLOCKED)
    val waitingCount = threadInfo.count { info =>
      val state = info.getThreadState
      state == Thread.State.WAITING || state == Thread.State.TIMED_WAITING
    }

    record(
      ThreadSnapshot(
        timestamp = System.currentTimeMillis(),
        threadCount = threadInfo.length,
        blockedThreads = blockedCount,
        waitingThreads = waitingCount,
        cpuUsage = getCpuUsage()
      )
    )
  }

  private def captureMemorySnapshot(): Unit = {
    // Memory monitoring implementation
  }

  /** Heap currently in use according to `Runtime`: total minus free, in bytes. */
  private def getUsedMemory(): Long = {
    val runtime = Runtime.getRuntime
    runtime.totalMemory() - runtime.freeMemory()
  }

  // Process CPU load as reported by the com.sun.management bean; 0.0 when that bean is absent.
  private def getCpuUsage(): Double =
    ManagementFactory.getOperatingSystemMXBean match {
      case sunBean: com.sun.management.OperatingSystemMXBean => sunBean.getProcessCpuLoad
      case _ => 0.0
    }

  /**
   * Aggregates a session's measurements into a report. A session that has not ended yet is
   * measured up to the current time.
   */
  def analyzeSession(session: ProfilerSession): PerformanceReport = {
    val methodMeasurements = session.measurements.collect { case m: MethodInvocation => m }
    val gcMeasurements = session.measurements.collect { case m: GCEvent => m }
    val threadMeasurements = session.measurements.collect { case m: ThreadSnapshot => m }

    PerformanceReport(
      sessionId = session.sessionId,
      duration = session.endTime.getOrElse(System.currentTimeMillis()) - session.startTime,
      hotspots = findHotspots(methodMeasurements),
      memoryLeaks = detectMemoryLeaks(methodMeasurements),
      gcAnalysis = analyzeGC(gcMeasurements),
      threadAnalysis = analyzeThreads(threadMeasurements)
    )
  }

  // The ten "Class.method" names with the highest total recorded time, highest first.
  private def findHotspots(methods: List[MethodInvocation]): List[Hotspot] =
    methods.groupBy(m => s"${m.className}.${m.methodName}")
      .map { case (methodName, invocations) =>
        val total = invocations.iterator.map(_.duration).sum
        Hotspot(
          methodName = methodName,
          totalTime = total,
          invocationCount = invocations.length,
          averageTime = total / invocations.length,
          totalMemory = invocations.iterator.map(_.memoryAllocated).sum
        )
      }
      .toList
      .sortBy(_.totalTime)
      .reverse
      .take(10)

  private def detectMemoryLeaks(methods: List[MethodInvocation]): List[MemoryLeak] = {
    // Simplified memory leak detection
    List.empty
  }

  // `gcCount` is the number of recorded GC events, i.e. samples that saw GC activity.
  private def analyzeGC(gcEvents: List[GCEvent]): GCAnalysis = {
    val durations = gcEvents.map(_.duration)
    val totalGCTime = durations.sum
    val gcCount = durations.length

    GCAnalysis(
      totalGCTime = totalGCTime,
      gcCount = gcCount,
      averageGCTime = if (gcCount > 0) totalGCTime / gcCount else 0L,
      maxGCTime = durations.maxOption.getOrElse(0L)
    )
  }

  private def analyzeThreads(threadSnapshots: List[ThreadSnapshot]): ThreadAnalysis = {
    // Without snapshots the mean is reported as 0.0 rather than NaN.
    val avgCpuUsage =
      if (threadSnapshots.isEmpty) 0.0
      else threadSnapshots.iterator.map(_.cpuUsage).sum / threadSnapshots.length

    ThreadAnalysis(
      maxThreadCount = threadSnapshots.map(_.threadCount).maxOption.getOrElse(0),
      averageCpuUsage = avgCpuUsage,
      maxBlockedThreads = threadSnapshots.map(_.blockedThreads).maxOption.getOrElse(0)
    )
  }
}

/**
 * Result of analysing one profiler session.
 *
 * @param sessionId      identifier of the analysed session
 * @param duration       session end time minus start time (both taken from
 *                       `System.currentTimeMillis`, so milliseconds)
 * @param hotspots       most time-consuming methods
 * @param memoryLeaks    suspected leaks
 * @param gcAnalysis     aggregated garbage-collection figures
 * @param threadAnalysis aggregated thread figures
 */
case class PerformanceReport(
  sessionId: String,
  duration: Long,
  hotspots: List[Hotspot],
  memoryLeaks: List[MemoryLeak],
  gcAnalysis: GCAnalysis,
  threadAnalysis: ThreadAnalysis
)

/**
 * Aggregated timings of one profiled method.
 *
 * @param methodName      "ClassName.methodName" of the profiled method
 * @param totalTime       sum of the recorded invocation durations
 * @param invocationCount number of recorded invocations
 * @param averageTime     `totalTime` divided by `invocationCount` (integer division)
 * @param totalMemory     sum of the recorded per-invocation memory deltas
 */
case class Hotspot(
  methodName: String,
  totalTime: Long,
  invocationCount: Int,
  averageTime: Long,
  totalMemory: Long
)

/**
 * A suspected memory leak.
 *
 * NOTE(review): nothing visible in this file creates instances of this class, so the field
 * meanings below are inferred from the names only — confirm before relying on them.
 *
 * @param location         presumably where the leak was observed
 * @param suspectedObjects presumably descriptions of the objects believed to be leaking
 * @param growthRate       presumably how fast the retained memory grows; units unspecified
 */
case class MemoryLeak(
  location: String,
  suspectedObjects: List[String],
  growthRate: Double
)

/**
 * Aggregated garbage-collection figures for a session.
 *
 * All times are in the unit of the analysed GC events' `duration`.
 *
 * @param totalGCTime   sum of the event durations
 * @param gcCount       number of GC events analysed
 * @param averageGCTime `totalGCTime` divided by `gcCount` (integer division), 0 without events
 * @param maxGCTime     longest single event duration, 0 without events
 */
case class GCAnalysis(
  totalGCTime: Long,
  gcCount: Int,
  averageGCTime: Long,
  maxGCTime: Long
)

/**
 * Aggregated thread figures for a session.
 *
 * @param maxThreadCount    highest thread count seen in any snapshot, 0 without snapshots
 * @param averageCpuUsage   mean of the snapshots' CPU usage values
 * @param maxBlockedThreads highest number of blocked threads seen in any snapshot,
 *                          0 without snapshots
 */
case class ThreadAnalysis(
  maxThreadCount: Int,
  averageCpuUsage: Double,
  maxBlockedThreads: Int
)

// Integration with external profiling tools
/**
 * Thin launchers for external profilers. The command-line integrations start the tool as a
 * child process and return immediately; they do not wait for it or check its exit code.
 */
object ExternalProfilers {

  /**
   * Starts `command` as a child process without going through a shell or a whitespace
   * tokenizer: every element is passed to the program as exactly one argument.
   * The child's output is forwarded to this JVM's stdout/stderr so failures are visible.
   *
   * @throws java.io.IOException if the program cannot be started
   */
  private def launch(command: String*): Process =
    new ProcessBuilder(command: _*).inheritIO().start()

  // Durations are passed to the tools in whole seconds; sub-second values are rounded up to 1.
  private def wholeSeconds(duration: Duration): Long = math.max(1L, duration.toSeconds)

  private def currentPid: String = ProcessHandle.current().pid().toString

  // JProfiler integration
  object JProfilerIntegration {

    /**
     * Starts CPU recording through the JProfiler agent API, looked up reflectively.
     * Prints a notice and does nothing else if the call fails for any reason.
     */
    def startCPUProfiling(): Unit = {
      // JProfiler API calls (when available)
      try {
        val controller = Class.forName("com.jprofiler.api.agent.Controller")
        controller
          .getMethod("startCPURecording", classOf[Boolean])
          .invoke(null, java.lang.Boolean.TRUE)
      } catch {
        case _: Exception => println("JProfiler not available")
      }
    }

    /** Stops CPU recording through the JProfiler agent API; failures are ignored. */
    def stopCPUProfiling(): Unit = {
      try {
        val controller = Class.forName("com.jprofiler.api.agent.Controller")
        controller.getMethod("stopCPURecording").invoke(null)
      } catch {
        case _: Exception => // Ignore
      }
    }
  }

  // JFR (Java Flight Recorder) integration
  object JFRIntegration {

    // The recording is started under a fixed name so that stopRecording can refer to it.
    private val RecordingName = "scala-profiler"

    /**
     * Asks this JVM, via `jcmd`, to start a flight recording.
     *
     * @param duration recording length, passed in whole seconds
     * @param filename file the recording is written to
     */
    def startRecording(duration: Duration, filename: String): Unit = {
      launch(
        "jcmd",
        currentPid,
        "JFR.start",
        s"name=$RecordingName",
        s"duration=${wholeSeconds(duration)}s",
        s"filename=$filename"
      )
    }

    /** Asks this JVM, via `jcmd`, to stop the recording started by [[startRecording]]. */
    def stopRecording(): Unit = {
      launch("jcmd", currentPid, "JFR.stop", s"name=$RecordingName")
    }
  }

  // Async Profiler integration
  object AsyncProfilerIntegration {

    /**
     * Launches async-profiler against this JVM.
     *
     * @param duration profiling length, passed in whole seconds
     * @param event    profiler event to sample
     */
    def profile(duration: Duration, event: String = "cpu"): Unit = {
      // Assuming async-profiler is available
      launch(
        "java",
        "-jar",
        "async-profiler.jar",
        "-e",
        event,
        "-d",
        wholeSeconds(duration).toString,
        currentPid
      )
    }
  }
}

Conclusion

Performance optimization in Scala requires a systematic approach combining measurement, analysis, and targeted improvements. Key principles include:

Measurement Foundation:

  • Establish baseline performance metrics
  • Use micro-benchmarks for specific operations
  • Monitor production performance continuously
  • Profile memory allocation and GC behavior

Optimization Strategies:

  • Algorithm and data structure optimization
  • Memory management and allocation reduction
  • JVM tuning and garbage collection optimization
  • Concurrent programming optimizations

Profiling Techniques:

  • Method-level performance monitoring
  • Memory allocation tracking
  • GC event analysis
  • Thread contention detection

Tools and Frameworks:

  • Built-in JVM monitoring capabilities
  • External profilers (JProfiler, VisualVM, Async Profiler)
  • Java Flight Recorder for production monitoring
  • Custom profiling frameworks for specific needs

Best Practices:

  • Measure before optimizing
  • Focus on hotspots and critical paths
  • Consider memory allocation patterns
  • Balance between different optimization goals
  • Test optimizations under realistic conditions

Common Pitfalls to Avoid:

  • Premature optimization without measurement
  • Optimizing non-critical code paths
  • Ignoring memory allocation impact
  • Over-optimization leading to complex code
  • Not considering concurrent access patterns

Production Considerations:

  • Monitor key performance indicators
  • Set up alerting for performance degradation
  • Regular performance regression testing
  • Capacity planning based on performance characteristics
  • A/B testing for optimization effectiveness

Performance optimization is an iterative process that requires careful measurement, analysis, and validation. The goal is to build applications that are not only correct but also efficient, scalable, and maintainable. Modern Scala applications can achieve excellent performance when these principles are applied systematically.