// Source: NavidromeApp/iOS/Views/Visualizer/OfflineAudioAnalyzer.swift (Swift, 336 lines, 14 KiB)

import Foundation
import AVFoundation
import Accelerate
2026-04-09 23:39:52 -07:00
/// Processes an entire audio file faster than real-time, producing per-frame level data
/// that can be cached and played back in sync with the audio.
/// Also optionally extracts SmartDJ profile data (silence boundaries + LUFS) in the same pass.
///
/// Only channel 0 of the file's processing format is analysed.
actor OfflineAudioAnalyzer {
    static let shared = OfflineAudioAnalyzer()

    /// Receives analysis progress in the range 0...1.
    typealias ProgressCallback = @Sendable (Float) -> Void

    // MARK: - Combined Analysis Result

    struct CombinedResult {
        /// One entry per visualizer frame; each entry holds `pointsCount` levels in 0...1.
        /// Frame `k` is centred on audio sample `k * hopSize`, where `hopSize = Int(sampleRate / fps)`.
        let visFrames: [[Float]]
        /// Leading silence end in seconds (start time of the first read chunk whose RMS exceeds the threshold).
        let silenceEnd: Double?
        /// Trailing silence start in seconds (end time of the last read chunk whose RMS exceeds the threshold).
        let silenceStart: Double?
        /// Approximate integrated loudness: the unweighted RMS level of channel 0 in dB
        /// relative to full scale (no K-weighting or gating is applied).
        let loudnessLUFS: Double?
    }

    // MARK: - Visualizer-only (legacy entry point)

    /// Analyses `url` and returns only the visualizer frames.
    ///
    /// Thin wrapper around `analyzeWithSmartDJ` with SmartDJ extraction disabled.
    /// See that method for parameter and error details.
    func analyze(
        url: URL,
        pointsCount: Int = 20,
        fps: Double = 30.0,
        cutoff: Int = 90,
        progress: ProgressCallback? = nil
    ) throws -> [[Float]] {
        let result = try analyzeWithSmartDJ(
            url: url,
            pointsCount: pointsCount,
            fps: fps,
            cutoff: cutoff,
            extractSmartDJ: false,
            progress: progress
        )
        return result.visFrames
    }

    // MARK: - Combined pass: vis frames + SmartDJ profile in one file read

    /// Reads the file once and produces visualizer frames plus, optionally, SmartDJ data.
    ///
    /// - Parameters:
    ///   - url: Audio file to analyse.
    ///   - pointsCount: Number of levels per visualizer frame. Must be positive.
    ///   - fps: Visualizer frames per second of audio. Must be positive and finite.
    ///   - cutoff: Highest FFT bin index used for binning (clamped to the valid bin range).
    ///   - extractSmartDJ: When `true`, also accumulates silence boundaries and loudness.
    ///   - progress: Called every 100 frames with the fraction processed, and with `1.0` at the end.
    /// - Returns: Normalized, temporally smoothed frames and the optional SmartDJ values.
    /// - Throws: `NSError` in domain `"OfflineAnalyzer"` (code 1: buffer allocation failed,
    ///   code 2: FFT setup failed, code 3: invalid parameters), any error thrown by
    ///   `AVAudioFile`, or `CancellationError` if the surrounding task is cancelled.
    func analyzeWithSmartDJ(
        url: URL,
        pointsCount: Int = 20,
        fps: Double = 30.0,
        cutoff: Int = 90,
        extractSmartDJ: Bool = true,
        progress: ProgressCallback? = nil
    ) throws -> CombinedResult {
        // A non-positive fps would trap in the Int conversion below and a non-positive
        // pointsCount would divide by zero during binning, so reject both up front.
        guard fps.isFinite, fps > 0, pointsCount > 0 else {
            throw Self.makeError(
                code: 3,
                "Invalid analysis parameters (fps and pointsCount must be positive)"
            )
        }

        let file = try AVAudioFile(forReading: url)
        let format = file.processingFormat
        let sampleRate = format.sampleRate
        let totalFrames = file.length
        let durationSec = Double(totalFrames) / sampleRate

        // FFT parameters: always 1024 regardless of fps.
        let fftSize = 1024
        let halfSize = fftSize / 2
        let log2n = vDSP_Length(log2(Double(fftSize)))
        guard let fftSetup = vDSP_create_fftsetup(log2n, Int32(kFFTRadix2)) else {
            throw Self.makeError(code: 2, "Failed to create FFT setup")
        }
        defer { vDSP_destroy_fftsetup(fftSetup) }

        // Hann window
        var window = [Float](repeating: 0, count: fftSize)
        vDSP_hann_window(&window, vDSP_Length(fftSize), Int32(vDSP_HANN_NORM))

        // How many AUDIO samples between each vis frame.
        // This is independent of fftSize: the FFT window always uses fftSize samples
        // but hops forward by hopSize each frame, giving overlap when fps is high
        // (hopSize < fftSize) and gaps between windows when fps is low (hopSize > fftSize).
        let hopSize = Int(max(1, sampleRate / fps))

        // Read in chunks large enough to hold at least one full FFT window.
        // Using 4x hopSize so we get several vis frames per disk read.
        let readChunkSamples = max(fftSize * 2, hopSize * 4)
        guard let readBuffer = AVAudioPCMBuffer(
            pcmFormat: format,
            frameCapacity: AVAudioFrameCount(readChunkSamples)
        ) else {
            throw Self.makeError(code: 1, "Failed to create buffer")
        }

        // Ring buffer: holds the most recent chunk plus the fftSize samples before it.
        // Absolute sample index `a` always lives at `ring[a % ringCapacity]`.
        let ringCapacity = readChunkSamples + fftSize
        var ring = [Float](repeating: 0, count: ringCapacity)
        var writeIndex = 0   // next write position (mod ringCapacity)
        var samplesRead = 0  // total samples ever written to the ring

        let estimatedFrames = Int(durationSec * fps) + 1
        var rawFrames: [[Float]] = []
        rawFrames.reserveCapacity(estimatedFrames)

        // SmartDJ accumulators
        let silenceThreshold: Float = 0.008
        var leadingSilenceEndSec: Double?
        var trailingSilenceStartSec: Double?
        var sumSquares: Double = 0
        var sampleCountLUFS: Int64 = 0

        // Sliding window state
        var nextFrameSample = 0  // audio sample index on which the next vis frame is centred
        var windowSamples = [Float](repeating: 0, count: fftSize)  // reused for every frame

        while file.framePosition < totalFrames {
            // Cooperatively cancel if the app backgrounded mid-analysis.
            try Task.checkCancellation()

            // Take the minimum in Int64 first so a very long remainder cannot overflow UInt32.
            let remaining = totalFrames - file.framePosition
            let toRead = AVAudioFrameCount(min(Int64(readChunkSamples), remaining))
            readBuffer.frameLength = 0
            try file.read(into: readBuffer, frameCount: toRead)

            let chunkLen = Int(readBuffer.frameLength)
            // A zero-length read means the file ended before `length` said it would.
            // Stop instead of retrying: the frame position would never advance.
            guard chunkLen > 0, let ch = readBuffer.floatChannelData?[0] else { break }

            // Write chunk into ring buffer
            let chunkStart = samplesRead
            for i in 0..<chunkLen {
                ring[writeIndex] = ch[i]
                writeIndex = (writeIndex + 1) % ringCapacity
            }
            samplesRead += chunkLen
            let chunkEnd = samplesRead
            let reachedEnd = file.framePosition >= totalFrames

            // SmartDJ: RMS & loudness over this chunk
            if extractSmartDJ {
                var rms: Float = 0
                vDSP_rmsqv(ch, 1, &rms, vDSP_Length(chunkLen))
                let chunkSec = Double(chunkStart) / sampleRate
                if rms > silenceThreshold {
                    if leadingSilenceEndSec == nil { leadingSilenceEndSec = chunkSec }
                    trailingSilenceStartSec = chunkSec + Double(chunkLen) / sampleRate
                }

                // vDSP_measqv yields the mean of squares; scale back to a sum.
                var meanSq: Float = 0
                vDSP_measqv(ch, 1, &meanSq, vDSP_Length(chunkLen))
                sumSquares += Double(meanSq) * Double(chunkLen)
                sampleCountLUFS += Int64(chunkLen)
            }

            // Generate vis frames for all frame positions inside this chunk.
            let oldestAvailable = max(0, samplesRead - ringCapacity)
            while nextFrameSample < chunkEnd {
                // Check cancellation every frame: the inner loop is the hot path.
                try Task.checkCancellation()

                // Centre the FFT window on the frame position for better transient response.
                let windowStart = nextFrameSample - halfSize
                let windowEnd = windowStart + fftSize

                // Wait for the next chunk if the window runs past what has been read,
                // unless the file is exhausted, in which case the tail is zero-padded.
                guard windowEnd <= samplesRead || reachedEnd else { break }

                // Samples before the start of the file or past its end read as silence,
                // so every hop position yields exactly one frame and frame k stays
                // aligned with playback time k / fps.
                for j in 0..<fftSize {
                    let absIdx = windowStart + j
                    if absIdx >= oldestAvailable && absIdx < samplesRead {
                        windowSamples[j] = ring[absIdx % ringCapacity]
                    } else {
                        windowSamples[j] = 0
                    }
                }

                let frame = computeFFTFrame(
                    samples: windowSamples,
                    fftSize: fftSize, halfSize: halfSize,
                    window: window, fftSetup: fftSetup, log2n: log2n,
                    pointsCount: pointsCount, cutoff: cutoff
                )
                rawFrames.append(frame)
                nextFrameSample += hopSize

                if rawFrames.count % 100 == 0 {
                    progress?(min(1.0, Float(nextFrameSample) / Float(totalFrames)))
                }
            }
        }
        progress?(1.0)

        // Normalize frames to 0...1 using the 95th-percentile peak so amplitude is
        // consistent across songs regardless of mastering level.
        let normalized = normalizeFrames(rawFrames)

        // Bake temporal smoothing into the frames so playback isn't jittery.
        let viscosity: Float = 0.25  // matches default VisualizerSettings.viscosity
        let smoothed = smoothFrames(normalized, viscosity: viscosity)

        // Loudness: 20 * log10(RMS) over every analysed sample.
        var loudnessLUFS: Double?
        if extractSmartDJ && sampleCountLUFS > 0 {
            let meanSquare = sumSquares / Double(sampleCountLUFS)
            if meanSquare > 0 {
                loudnessLUFS = 20.0 * log10(sqrt(meanSquare))
            }
        }

        // Silence guards: discard boundaries that are negligible or implausibly large.
        let safeLeading: Double? = {
            guard let t = leadingSilenceEndSec, t > 0.05, t < durationSec * 0.25 else { return nil }
            return t
        }()
        let safeTrailing: Double? = {
            guard let t = trailingSilenceStartSec,
                  t < durationSec - 0.5, t > durationSec * 0.5 else { return nil }
            return t
        }()

        return CombinedResult(
            visFrames: smoothed,
            silenceEnd: safeLeading,
            silenceStart: safeTrailing,
            loudnessLUFS: loudnessLUFS
        )
    }

    // MARK: - Errors

    /// Builds an `NSError` in the analyzer's error domain.
    private static func makeError(code: Int, _ message: String) -> NSError {
        NSError(
            domain: "OfflineAnalyzer",
            code: code,
            userInfo: [NSLocalizedDescriptionKey: message]
        )
    }

    // MARK: - FFT Frame

    /// Converts one window of `fftSize` samples into `pointsCount` band levels.
    ///
    /// - Parameters:
    ///   - samples: Exactly `fftSize` time-domain samples.
    ///   - fftSize: FFT length (a power of two matching `fftSetup`).
    ///   - halfSize: `fftSize / 2`, the number of spectrum bins produced.
    ///   - window: Window function of length `fftSize` applied before the FFT.
    ///   - fftSetup: vDSP setup created for at least `log2n`.
    ///   - log2n: Base-2 logarithm of `fftSize`.
    ///   - pointsCount: Number of output bands.
    ///   - cutoff: Highest bin index considered; clamped to `1...(halfSize - 1)`.
    /// - Returns: `pointsCount` un-normalized band amplitudes (empty if `pointsCount <= 0`).
    private func computeFFTFrame(
        samples: [Float],
        fftSize: Int,
        halfSize: Int,
        window: [Float],
        fftSetup: FFTSetup,
        log2n: vDSP_Length,
        pointsCount: Int,
        cutoff: Int
    ) -> [Float] {
        guard pointsCount > 0 else { return [] }

        // Apply the window function.
        var windowed = [Float](repeating: 0, count: fftSize)
        vDSP_vmul(samples, 1, window, 1, &windowed, 1, vDSP_Length(fftSize))

        // Real-to-complex FFT, then squared magnitudes per bin.
        var realp = [Float](repeating: 0, count: halfSize)
        var imagp = [Float](repeating: 0, count: halfSize)
        var magnitudes = [Float](repeating: 0, count: halfSize)
        realp.withUnsafeMutableBufferPointer { realBuf in
            imagp.withUnsafeMutableBufferPointer { imagBuf in
                guard let re = realBuf.baseAddress, let im = imagBuf.baseAddress else { return }
                var split = DSPSplitComplex(realp: re, imagp: im)
                windowed.withUnsafeBytes { raw in
                    guard let interleaved = raw.bindMemory(to: DSPComplex.self).baseAddress else {
                        return
                    }
                    vDSP_ctoz(interleaved, 2, &split, 1, vDSP_Length(halfSize))
                }
                vDSP_fft_zrip(fftSetup, &split, 1, log2n, FFTDirection(FFT_FORWARD))
                vDSP_zvmags(&split, 1, &magnitudes, 1, vDSP_Length(halfSize))
            }
        }

        // Normalize: divide by N squared, then sqrt for perceptual amplitude.
        let scale = 1.0 / (Float(fftSize) * Float(fftSize))
        let spectrum = magnitudes.map { ($0 * scale).squareRoot() }

        // Log-spaced band centres with a uniform band width.
        // No eqBoost: frequency bands are weighted equally.
        let maxBin = max(1, min(halfSize - 1, cutoff))
        let halfWidth = max(1, maxBin / pointsCount) / 2
        var frame = [Float](repeating: 0, count: pointsCount)
        for i in 0..<pointsCount {
            let position = Float(i + 1) / Float(pointsCount)
            let logPosition = log10(position * 9.0 + 1.0)  // 0...1, log-spaced
            let centerBin = Int(logPosition * Float(maxBin))

            // Bin 0 (DC) is never used. `hi` is clamped up to `lo` so that a small
            // cutoff or a large pointsCount cannot form an invalid (trapping) range.
            let lo = max(1, centerBin - halfWidth)
            let hi = max(lo, min(maxBin, centerBin + halfWidth))

            var sum: Float = 0
            for bin in lo...hi {
                sum += spectrum[bin]
            }
            frame[i] = sum / Float(hi - lo + 1)
        }
        return frame
    }

    // MARK: - Post-processing

    /// Normalize all frames so the 95th-percentile peak maps to 0.8.
    /// This keeps loud transients visible without clipping, and ensures a quiet
    /// song fills the visualizer at the same apparent height as a loud one.
    /// Values are clamped to at most 1.0; input without any positive value is returned unchanged.
    private func normalizeFrames(_ frames: [[Float]]) -> [[Float]] {
        guard !frames.isEmpty else { return frames }

        // Collect all non-zero values to find the 95th percentile.
        var allValues: [Float] = []
        allValues.reserveCapacity(frames.count * (frames.first?.count ?? 1))
        for frame in frames {
            for value in frame where value > 0 { allValues.append(value) }
        }
        guard !allValues.isEmpty else { return frames }

        allValues.sort()
        let p95Index = min(Int(Float(allValues.count) * 0.95), allValues.count - 1)
        let p95 = allValues[p95Index]
        guard p95 > 0 else { return frames }

        let scale = 0.8 / p95
        return frames.map { frame in frame.map { min(1.0, $0 * scale) } }
    }

    /// Bake temporal smoothing into the frames: each frame moves from the previous
    /// smoothed frame towards its own value by the fraction `viscosity`.
    /// The first frame is left untouched.
    private func smoothFrames(_ frames: [[Float]], viscosity: Float) -> [[Float]] {
        guard frames.count > 1 else { return frames }

        var result = frames
        var prev = frames[0]
        for i in 1..<frames.count {
            var smoothed = [Float](repeating: 0, count: prev.count)
            let n = min(prev.count, frames[i].count)
            for j in 0..<n {
                smoothed[j] = prev[j] + (frames[i][j] - prev[j]) * viscosity
            }
            result[i] = smoothed
            prev = smoothed
        }
        return result
    }
}