NavidromeApp/iOS/Views/Visualizer/OfflineAudioAnalyzer.swift

import Foundation
import AVFoundation
import Accelerate

/// Processes an entire audio file faster than real-time, producing per-frame level data
/// that can be cached and played back in sync with the audio.
/// Also optionally extracts SmartDJ profile data (silence boundaries + LUFS) in the same pass.
actor OfflineAudioAnalyzer {
    static let shared = OfflineAudioAnalyzer()

    typealias ProgressCallback = @Sendable (Float) -> Void

    // MARK: - Combined Analysis Result

    /// Output of a combined analysis pass: the visualizer frames plus the
    /// optional SmartDJ profile values. The three SmartDJ fields are optional
    /// and may be `nil`.
    struct CombinedResult {
        /// Visualizer level data: one inner array of band levels per frame.
        let visFrames: [[Float]]
        /// End of the leading silence, in seconds.
        let silenceEnd: Double?    // leading silence end in seconds
        /// Start of the trailing silence, in seconds.
        let silenceStart: Double?  // trailing silence start in seconds
        /// Approximate integrated loudness.
        let loudnessLUFS: Double?  // approximate integrated loudness
    }

    // MARK: - Visualizer-only (legacy entry point)

    /// Visualizer-only convenience wrapper around `analyzeWithSmartDJ`.
    ///
    /// Runs the combined pass with SmartDJ extraction switched off and hands
    /// back just the visualizer frames.
    ///
    /// - Parameters:
    ///   - url: Audio file to analyse.
    ///   - pointsCount: Number of bands per frame.
    ///   - fps: Visualizer frames per second of audio.
    ///   - cutoff: Highest FFT bin that contributes to the bands.
    ///   - progress: Optional callback forwarded to the combined pass.
    /// - Returns: The `visFrames` of the combined result.
    /// - Throws: Whatever `analyzeWithSmartDJ` throws.
    func analyze(
        url: URL,
        pointsCount: Int = 20,
        fps: Double = 30.0,
        cutoff: Int = 90,
        progress: ProgressCallback? = nil
    ) throws -> [[Float]] {
        try analyzeWithSmartDJ(
            url: url,
            pointsCount: pointsCount,
            fps: fps,
            cutoff: cutoff,
            extractSmartDJ: false,
            progress: progress
        ).visFrames
    }

    // MARK: - Combined pass: vis frames + SmartDJ profile in one file read

    /// Reads the audio file at `url` once and produces visualizer frames plus,
    /// optionally, SmartDJ profile data (silence boundaries and an approximate
    /// loudness figure).
    ///
    /// One visualizer frame is produced every `sampleRate / fps` samples,
    /// starting at sample 0. Each frame is a 1024-point FFT centred on the frame
    /// position; any part of the window that falls before the first or after the
    /// last sample is treated as silence, so frame `k` always corresponds to
    /// time `k / fps`. Only the first channel of the file is analysed.
    ///
    /// - Parameters:
    ///   - url: Audio file to analyse.
    ///   - pointsCount: Number of bands per visualizer frame; must be positive.
    ///   - fps: Visualizer frames per second of audio; must be finite and positive.
    ///   - cutoff: Highest FFT bin that contributes to the bands.
    ///   - extractSmartDJ: When `false`, the silence/loudness fields are `nil`.
    ///   - progress: Called every 100 frames with a 0–1 fraction, and with `1.0` at the end.
    /// - Returns: Normalized, smoothed frames and the (optional) SmartDJ values.
    /// - Throws: `CancellationError` when the surrounding task is cancelled, an
    ///   `NSError` in domain `"OfflineAnalyzer"` for invalid parameters (code 3),
    ///   buffer creation failure (code 1) or FFT setup failure (code 2), and any
    ///   error raised by `AVAudioFile` while opening or reading.
    func analyzeWithSmartDJ(
        url: URL,
        pointsCount: Int = 20,
        fps: Double = 30.0,
        cutoff: Int = 90,
        extractSmartDJ: Bool = true,
        progress: ProgressCallback? = nil
    ) throws -> CombinedResult {

        // Reject parameters that would otherwise trap further down
        // (division by zero, negative array counts, Int(inf)).
        guard fps.isFinite, fps > 0, pointsCount > 0 else {
            throw NSError(domain: "OfflineAnalyzer", code: 3,
                          userInfo: [NSLocalizedDescriptionKey: "Invalid analysis parameters"])
        }

        let file = try AVAudioFile(forReading: url)
        let format = file.processingFormat
        let sampleRate = format.sampleRate
        let totalFrames = file.length
        let durationSec = Double(totalFrames) / sampleRate

        // FFT parameters — always 1024 regardless of fps
        let fftSize = 1024
        let halfSize = fftSize / 2
        let log2n = vDSP_Length(log2(Double(fftSize)))

        guard let fftSetup = vDSP_create_fftsetup(log2n, Int32(kFFTRadix2)) else {
            throw NSError(domain: "OfflineAnalyzer", code: 2,
                          userInfo: [NSLocalizedDescriptionKey: "Failed to create FFT setup"])
        }
        defer { vDSP_destroy_fftsetup(fftSetup) }

        // Hann window
        var window = [Float](repeating: 0, count: fftSize)
        vDSP_hann_window(&window, vDSP_Length(fftSize), Int32(vDSP_HANN_NORM))

        // Number of audio samples between consecutive vis frames. This is
        // independent of fftSize: every frame analyses fftSize samples, so
        // windows overlap when hopSize < fftSize and leave gaps between them
        // when hopSize > fftSize.
        let samplesPerFrame = sampleRate / fps
        // Bound the hop so the chunk size below still fits in AVAudioFrameCount.
        guard samplesPerFrame.isFinite,
              samplesPerFrame < Double(AVAudioFrameCount.max / 8) else {
            throw NSError(domain: "OfflineAnalyzer", code: 3,
                          userInfo: [NSLocalizedDescriptionKey: "Invalid analysis parameters"])
        }
        let hopSize = max(1, Int(samplesPerFrame))

        // Read in chunks large enough to hold at least one full FFT window.
        // Using 4× hopSize so we get several vis frames per disk read.
        let readChunkSamples = max(fftSize * 2, hopSize * 4)
        guard let readBuffer = AVAudioPCMBuffer(
            pcmFormat: format,
            frameCapacity: AVAudioFrameCount(readChunkSamples)
        ) else {
            throw NSError(domain: "OfflineAnalyzer", code: 1,
                          userInfo: [NSLocalizedDescriptionKey: "Failed to create buffer"])
        }

        // Ring buffer: one full chunk plus one FFT window of history, so a
        // window that straddles two chunks is still fully available.
        let ringCapacity = readChunkSamples + fftSize
        var ring = [Float](repeating: 0, count: ringCapacity)
        var ringHead = 0        // next write position (mod ringCapacity)
        var samplesWritten = 0  // total samples ever written to the ring

        let estimatedFrames = Int(durationSec * fps) + 1
        var rawFrames: [[Float]] = []
        rawFrames.reserveCapacity(estimatedFrames)

        // SmartDJ accumulators
        let silenceThreshold: Float = 0.008
        var leadingSilenceEndSec: Double? = nil
        var trailingSilenceStartSec: Double? = nil
        var sumSquares: Double = 0
        var sampleCountLUFS: Int64 = 0

        // Scratch window reused for every frame (avoids one allocation per frame).
        var windowSamples = [Float](repeating: 0, count: fftSize)
        var nextFrameSample = 0   // audio sample index of the next vis frame
        var reachedEnd = false

        while !reachedEnd {
            // Cooperatively cancel if the app backgrounded mid-analysis
            try Task.checkCancellation()

            var chunkLen = 0
            if file.framePosition < totalFrames {
                // Take the minimum in 64-bit before narrowing so a very long
                // file cannot overflow AVAudioFrameCount.
                let remaining = totalFrames - file.framePosition
                let toRead = AVAudioFrameCount(min(Int64(readChunkSamples), remaining))
                readBuffer.frameLength = 0
                try file.read(into: readBuffer, frameCount: toRead)
                chunkLen = Int(readBuffer.frameLength)
            }

            if chunkLen == 0 {
                // End of file, or the decoder produced nothing even though the
                // reported length says more should follow. Either way stop
                // reading (retrying would spin forever) and flush below.
                reachedEnd = true
            } else if let ch = readBuffer.floatChannelData?[0] {
                let chunkStart = samplesWritten

                // Write chunk into ring buffer
                for i in 0..<chunkLen {
                    ring[ringHead] = ch[i]
                    ringHead += 1
                    if ringHead == ringCapacity { ringHead = 0 }
                }
                samplesWritten += chunkLen

                // SmartDJ RMS & loudness over this chunk
                if extractSmartDJ {
                    var meanSq: Float = 0
                    vDSP_measqv(ch, 1, &meanSq, vDSP_Length(chunkLen))
                    let rms = meanSq.squareRoot()
                    let chunkSec = Double(chunkStart) / sampleRate
                    if rms > silenceThreshold {
                        if leadingSilenceEndSec == nil { leadingSilenceEndSec = chunkSec }
                        trailingSilenceStartSec = chunkSec + Double(chunkLen) / sampleRate
                    }
                    // measqv returns the mean, so scale back up to a sum.
                    sumSquares += Double(meanSq) * Double(chunkLen)
                    sampleCountLUFS += Int64(chunkLen)
                }
            } else {
                // No float channel data for this chunk: nothing to analyse.
                continue
            }

            // Generate vis frames for every frame position we have audio for.
            let available = samplesWritten
            // Oldest absolute sample index still held in the ring.
            let oldest = max(0, available - ringCapacity)

            while nextFrameSample < available {
                // Check cancellation every frame — the inner loop is the hot path
                try Task.checkCancellation()

                // Centre the FFT window on the frame position for better
                // transient response.
                let windowStart = nextFrameSample - halfSize
                let windowEnd = windowStart + fftSize

                // Wait for the next chunk if the window isn't complete yet;
                // once the file is exhausted the tail is zero-padded instead.
                if windowEnd > available && !reachedEnd { break }

                // Copy the window out of the ring. Positions before the first
                // or after the last sample count as silence so no frame is
                // ever dropped and frame k stays aligned with time k / fps.
                for j in 0..<fftSize {
                    let absIdx = windowStart + j
                    if absIdx >= oldest && absIdx < available {
                        let back = available - absIdx   // 1...ringCapacity
                        windowSamples[j] = ring[(ringHead - back + ringCapacity) % ringCapacity]
                    } else {
                        windowSamples[j] = 0
                    }
                }

                let frame = computeFFTFrame(
                    samples: windowSamples,
                    fftSize: fftSize, halfSize: halfSize,
                    window: window, fftSetup: fftSetup,
                    pointsCount: pointsCount, cutoff: cutoff
                )
                rawFrames.append(frame)

                nextFrameSample += hopSize

                if rawFrames.count % 100 == 0 {
                    progress?(min(1.0, Float(nextFrameSample) / Float(totalFrames)))
                }
            }
        }
        progress?(1.0)

        // ── Normalize frames to 0–1 using 95th-percentile peak ──────────────
        // Keeps amplitude consistent across songs regardless of mastering level.
        let normalized = normalizeFrames(rawFrames)

        // ── Apply per-frame temporal smoothing ───────────────────────────────
        // Baked in here so playback of pre-analyzed frames isn't jittery.
        let viscosity: Float = 0.25  // matches default VisualizerSettings.viscosity
        let smoothed = smoothFrames(normalized, viscosity: viscosity)

        // ── LUFS ─────────────────────────────────────────────────────────────
        // Approximation: plain RMS level in dB of the first channel.
        var loudnessLUFS: Double? = nil
        if extractSmartDJ && sampleCountLUFS > 0 {
            let meanSquare = sumSquares / Double(sampleCountLUFS)
            if meanSquare > 0 {
                loudnessLUFS = 20.0 * log10(sqrt(meanSquare))
            }
        }

        // ── Silence guard ────────────────────────────────────────────────────
        // Only report boundaries that look plausible: leading silence must end
        // within the first quarter, trailing silence must start in the second
        // half and at least 0.5 s before the end.
        let safeLeading: Double? = {
            guard let t = leadingSilenceEndSec, t > 0.05, t < durationSec * 0.25 else { return nil }
            return t
        }()
        let safeTrailing: Double? = {
            guard let t = trailingSilenceStartSec,
                  t < durationSec - 0.5, t > durationSec * 0.5 else { return nil }
            return t
        }()

        return CombinedResult(
            visFrames: smoothed,
            silenceEnd: safeLeading,
            silenceStart: safeTrailing,
            loudnessLUFS: loudnessLUFS
        )
    }

    // MARK: - FFT Frame

    /// Turns one window of audio samples into `pointsCount` band levels.
    ///
    /// The samples are multiplied by `window`, run through a real forward FFT,
    /// converted to amplitudes and then averaged into log-spaced bands that
    /// cover bins 1 through `cutoff` (bin 0 is never used).
    ///
    /// - Parameters:
    ///   - samples: Input samples; expected to hold at least `fftSize` values.
    ///   - fftSize: FFT length. Assumed to be a power of two supported by `fftSetup`.
    ///   - halfSize: Number of complex bins. Assumed to equal `fftSize / 2`.
    ///   - window: Window coefficients, at least `fftSize` values.
    ///   - fftSetup: vDSP setup object created for this FFT length.
    ///   - pointsCount: Number of output bands. Non-positive values yield an empty frame.
    ///   - cutoff: Highest FFT bin to include; clamped to `1...halfSize - 1`.
    /// - Returns: `pointsCount` un-normalized band amplitudes.
    private func computeFFTFrame(
        samples: [Float],
        fftSize: Int,
        halfSize: Int,
        window: [Float],
        fftSetup: FFTSetup,
        pointsCount: Int,
        cutoff: Int
    ) -> [Float] {
        // Nothing to compute, and the binning below would divide by zero
        // or build a negative-length array.
        guard pointsCount > 0 else { return [] }

        // Apply Hann window
        var windowed = [Float](repeating: 0, count: fftSize)
        vDSP_vmul(samples, 1, window, 1, &windowed, 1, vDSP_Length(fftSize))

        // FFT
        var realp = [Float](repeating: 0, count: halfSize)
        var imagp = [Float](repeating: 0, count: halfSize)
        var magnitudes = [Float](repeating: 0, count: halfSize)

        realp.withUnsafeMutableBufferPointer { rb in
            imagp.withUnsafeMutableBufferPointer { ib in
                var sc = DSPSplitComplex(realp: rb.baseAddress!, imagp: ib.baseAddress!)
                // Reinterpret the real signal as interleaved pairs and split
                // it into the even/odd layout vDSP_fft_zrip expects.
                windowed.withUnsafeBytes { raw in
                    vDSP_ctoz(raw.bindMemory(to: DSPComplex.self).baseAddress!,
                              2, &sc, 1, vDSP_Length(halfSize))
                }
                let log2n = vDSP_Length(log2(Double(fftSize)))
                vDSP_fft_zrip(fftSetup, &sc, 1, log2n, FFTDirection(FFT_FORWARD))
                // Squared magnitude of every bin.
                vDSP_zvmags(&sc, 1, &magnitudes, 1, vDSP_Length(halfSize))
            }
        }

        // Normalize: divide by N², then sqrt for perceptual amplitude
        let n2 = Float(fftSize) * Float(fftSize)
        var scale = 1.0 / n2
        vDSP_vsmul(magnitudes, 1, &scale, &magnitudes, 1, vDSP_Length(halfSize))
        for i in 0..<halfSize { magnitudes[i] = sqrt(magnitudes[i]) }

        // Log-spaced band centres, each averaged over a uniform bin width.
        // No per-band boost: every band is weighted equally.
        var frame = [Float](repeating: 0, count: pointsCount)
        // Keep at least bin 1 so a zero or negative cutoff cannot invert the ranges below.
        let maxBin = max(1, min(halfSize - 1, cutoff))
        let uniformBinWidth = max(1, maxBin / pointsCount)
        let halfWidth = uniformBinWidth / 2

        for i in 0..<pointsCount {
            let nIdx = Float(i + 1) / Float(pointsCount)
            let logIdx = log10(nIdx * 9.0 + 1.0)          // 0→1 log-spaced
            let centerBin = Int(logIdx * Float(maxBin))
            let lo = max(1, centerBin - halfWidth)
            // With many bands the lowest centre can round to bin 0 while the
            // half-width is 0; clamp so `lo...hi` is never an inverted range.
            let hi = max(lo, min(maxBin, centerBin + halfWidth))
            var sum: Float = 0
            var count = 0
            for j in lo...hi where j < magnitudes.count {
                sum += magnitudes[j]
                count += 1
            }
            frame[i] = count > 0 ? sum / Float(count) : 0
        }
        return frame
    }

    // MARK: - Post-processing

    /// Rescales every frame so that the 95th-percentile level maps to 0.8,
    /// clamping anything that ends up above 1.0.
    ///
    /// The percentile is taken over all strictly positive values across all
    /// frames. If there are no frames, or no positive values, the input is
    /// returned unchanged.
    ///
    /// - Parameter frames: Raw band levels, one inner array per frame.
    /// - Returns: Frames of the same shape with the gain applied.
    private func normalizeFrames(_ frames: [[Float]]) -> [[Float]] {
        // Pool the positive levels of every frame, ordered ascending.
        let positives = frames
            .flatMap { row in row.filter { $0 > 0 } }
            .sorted()
        guard let lastIndex = positives.indices.last else { return frames }

        // Pick the value sitting at the 95 % rank (clamped to the final index).
        let rank = min(Int(Float(positives.count) * 0.95), lastIndex)
        let reference = positives[rank]
        guard reference > 0 else { return frames }

        let gain = 0.8 / reference
        return frames.map { row in
            row.map { level in min(1.0, level * gain) }
        }
    }

    /// Applies temporal smoothing across consecutive frames.
    ///
    /// The first frame is kept as is. Every later frame moves from the previous
    /// *smoothed* frame towards its own raw values by the fraction `viscosity`.
    /// Each output frame has the length of the smoothed frame before it; bands
    /// missing from the raw frame are left at 0.
    ///
    /// - Parameters:
    ///   - frames: Band levels, one inner array per frame.
    ///   - viscosity: Blend factor applied to the difference between frames.
    /// - Returns: The smoothed frames; the input itself when it has fewer than two frames.
    private func smoothFrames(_ frames: [[Float]], viscosity: Float) -> [[Float]] {
        guard frames.count > 1, var carry = frames.first else { return frames }

        var output = frames
        for (index, target) in frames.enumerated().dropFirst() {
            var blended = [Float](repeating: 0, count: carry.count)
            // zip stops at the shorter of the two frames.
            for (band, (previous, current)) in zip(carry, target).enumerated() {
                blended[band] = previous + (current - previous) * viscosity
            }
            output[index] = blended
            carry = blended
        }
        return output
    }
}