NavidromeApp/iOS/Views/Visualizer/OfflineAudioAnalyzer.swift
Dallas Groot 3cfcf026d7 fixes
2026-04-10 17:50:26 -07:00

335 lines
14 KiB
Swift
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import Foundation
import AVFoundation
import Accelerate
/// Processes an entire audio file faster than real-time, producing per-frame level data
/// that can be cached and played back in sync with the audio.
/// Also optionally extracts SmartDJ profile data (silence boundaries + LUFS) in the same pass.
actor OfflineAudioAnalyzer {
/// Shared analyzer instance.
static let shared = OfflineAudioAnalyzer()
/// Callback used to report analysis progress. Within this file it is invoked with the
/// fraction of the file processed so far, and with `1.0` once reading has finished.
typealias ProgressCallback = @Sendable (Float) -> Void
// MARK: - Combined Analysis Result
/// Output of a combined analysis pass: visualizer frames plus optional SmartDJ profile values.
struct CombinedResult {
/// Visualizer frames; each inner array holds the band levels of one frame.
let visFrames: [[Float]]
/// Leading silence end, in seconds (`nil` when not available).
let silenceEnd: Double? // leading silence end in seconds
/// Trailing silence start, in seconds (`nil` when not available).
let silenceStart: Double? // trailing silence start in seconds
/// Approximate integrated loudness (`nil` when not available).
let loudnessLUFS: Double? // approximate integrated loudness
}
// MARK: - Visualizer-only (legacy entry point)
/// Visualizer-only convenience wrapper.
///
/// Runs the combined analysis with SmartDJ extraction switched off and hands back
/// only the visualizer frames of the result.
///
/// - Parameters:
///   - url: Audio file to analyse.
///   - pointsCount: Number of values per visualizer frame.
///   - fps: Visualizer frames per second of audio.
///   - cutoff: Highest FFT bin used for the visualizer bands.
///   - progress: Optional progress callback, forwarded unchanged.
/// - Returns: The `visFrames` of the combined result.
/// - Throws: Whatever `analyzeWithSmartDJ` throws.
func analyze(
    url: URL,
    pointsCount: Int = 20,
    fps: Double = 30.0,
    cutoff: Int = 90,
    progress: ProgressCallback? = nil
) throws -> [[Float]] {
    try analyzeWithSmartDJ(
        url: url,
        pointsCount: pointsCount,
        fps: fps,
        cutoff: cutoff,
        extractSmartDJ: false,
        progress: progress
    ).visFrames
}
// MARK: - Combined pass: vis frames + SmartDJ profile in one file read
/// Decodes `url` once and produces visualizer frames and, optionally, SmartDJ profile data.
///
/// Only the first channel of the file's processing format is analysed. Visualizer frame `i`
/// comes from a 1024-sample Hann-windowed FFT centred on sample `i * hopSize`, where
/// `hopSize = Int(sampleRate / fps)`. Samples before the start of the file are treated as
/// silence so that frame 0 lines up with t = 0. Frames whose window would extend past the
/// end of the decoded audio are not emitted. The raw frames are then normalised and smoothed.
///
/// - Parameters:
///   - url: Audio file to analyse.
///   - pointsCount: Number of values per visualizer frame. Must be positive.
///   - fps: Visualizer frames per second of audio. Must be positive and finite.
///   - cutoff: Highest FFT bin used for the visualizer bands.
///   - extractSmartDJ: When `true`, chunk-level silence boundaries and the overall
///     mean-square level are tracked in the same pass.
///   - progress: Called every 100 frames with a 0-1 fraction, and with `1.0` after reading.
/// - Returns: The processed frames, plus (when `extractSmartDJ` is `true`):
///   `silenceEnd`, the start time of the first chunk whose RMS exceeds the silence threshold,
///   kept only if it is later than 0.05 s and within the first quarter of the file;
///   `silenceStart`, the end time of the last such chunk, kept only if it lies in the second
///   half of the file and more than 0.5 s before its end; and `loudnessLUFS`, computed as
///   `20 * log10(RMS)` over all analysed samples (no frequency weighting is applied here).
/// - Throws: `CancellationError` when the surrounding task is cancelled, any error raised
///   while opening or reading the file, or an `NSError` in domain `"OfflineAnalyzer"`
///   (1: buffer allocation failed, 2: FFT setup failed, 3: invalid parameters).
func analyzeWithSmartDJ(
    url: URL,
    pointsCount: Int = 20,
    fps: Double = 30.0,
    cutoff: Int = 90,
    extractSmartDJ: Bool = true,
    progress: ProgressCallback? = nil
) throws -> CombinedResult {
    // Reject values that would otherwise trap further down (Int(inf), division by zero).
    guard pointsCount > 0, fps > 0, fps.isFinite else {
        throw NSError(domain: "OfflineAnalyzer", code: 3,
                      userInfo: [NSLocalizedDescriptionKey: "Invalid analysis parameters"])
    }

    let file = try AVAudioFile(forReading: url)
    let format = file.processingFormat
    let sampleRate = format.sampleRate
    let totalFrames = file.length
    let durationSec = Double(totalFrames) / sampleRate

    // FFT parameters - always 1024 regardless of fps.
    let fftSize = 1024
    let halfSize = fftSize / 2
    let log2n = vDSP_Length(log2(Double(fftSize)))
    guard let fftSetup = vDSP_create_fftsetup(log2n, Int32(kFFTRadix2)) else {
        throw NSError(domain: "OfflineAnalyzer", code: 2,
                      userInfo: [NSLocalizedDescriptionKey: "Failed to create FFT setup"])
    }
    defer { vDSP_destroy_fftsetup(fftSetup) }

    // Hann window
    var window = [Float](repeating: 0, count: fftSize)
    vDSP_hann_window(&window, vDSP_Length(fftSize), Int32(vDSP_HANN_NORM))

    // Number of audio samples between consecutive vis frames. This is independent of
    // fftSize: every FFT uses fftSize samples but the window advances by hopSize, so
    // windows overlap when fps is high (hopSize < fftSize) and leave gaps when it is low.
    let hopSize = Int(max(1, sampleRate / fps))

    // Read in chunks large enough to hold at least one full FFT window.
    // Using 4x hopSize so we get several vis frames per disk read.
    let readChunkSamples = max(fftSize * 2, hopSize * 4)
    guard let readBuffer = AVAudioPCMBuffer(
        pcmFormat: format,
        frameCapacity: AVAudioFrameCount(readChunkSamples)
    ) else {
        throw NSError(domain: "OfflineAnalyzer", code: 1,
                      userInfo: [NSLocalizedDescriptionKey: "Failed to create buffer"])
    }

    // Ring buffer sized for one full chunk plus one FFT window of history.
    // Writing starts at index 0, so absolute sample `a` always lives at `a % ringCapacity`.
    let ringCapacity = readChunkSamples + fftSize
    var ring = [Float](repeating: 0, count: ringCapacity)
    var ringHead = 0            // next write position (mod ringCapacity)
    var totalSamplesInRing = 0  // total samples ever written to the ring

    var rawFrames: [[Float]] = []
    rawFrames.reserveCapacity(Int(durationSec * fps) + 1)

    // SmartDJ accumulators
    let silenceThreshold: Float = 0.008
    var leadingSilenceEndSec: Double? = nil
    var trailingSilenceStartSec: Double? = nil
    var sumSquares: Double = 0
    var sampleCountLUFS: Int64 = 0

    // Sliding window state
    var nextFrameSample = 0 // audio sample index on which the next vis frame is centred
    var windowSamples = [Float](repeating: 0, count: fftSize) // reused for every frame

    while file.framePosition < totalFrames {
        // Cooperatively cancel if the app backgrounded mid-analysis
        try Task.checkCancellation()

        // Clamp in Int64 first: converting the raw remainder to AVAudioFrameCount (UInt32)
        // would trap for files with more than UInt32.max frames left.
        let remaining = totalFrames - file.framePosition
        let toRead = AVAudioFrameCount(min(Int64(readChunkSamples), remaining))
        readBuffer.frameLength = 0
        try file.read(into: readBuffer, frameCount: toRead)

        let chunkLen = Int(readBuffer.frameLength)
        let chunkStart = Int(file.framePosition) - chunkLen
        // A read that delivers nothing does not advance framePosition, so retrying would
        // loop forever; stop and return what has been analysed so far.
        guard chunkLen > 0, let ch = readBuffer.floatChannelData?[0] else { break }

        // Write chunk into ring buffer
        for i in 0..<chunkLen {
            ring[ringHead] = ch[i]
            ringHead = (ringHead + 1) % ringCapacity
        }
        totalSamplesInRing += chunkLen

        // SmartDJ: silence boundaries and loudness over this chunk
        if extractSmartDJ {
            var meanSquare: Float = 0
            vDSP_measqv(ch, 1, &meanSquare, vDSP_Length(chunkLen))
            let chunkSec = Double(chunkStart) / sampleRate
            if meanSquare.squareRoot() > silenceThreshold {
                if leadingSilenceEndSec == nil { leadingSilenceEndSec = chunkSec }
                trailingSilenceStartSec = chunkSec + Double(chunkLen) / sampleRate
            }
            // measqv returns the mean, so scale back up to a sum before accumulating.
            sumSquares += Double(meanSquare) * Double(chunkLen)
            sampleCountLUFS += Int64(chunkLen)
        }

        // Generate vis frames for all frame positions inside this chunk
        let chunkEnd = chunkStart + chunkLen
        while nextFrameSample < chunkEnd {
            // Check cancellation every frame - the inner loop is the hot path
            try Task.checkCancellation()

            // Centre the FFT window on the frame position for better transient response.
            let windowStart = nextFrameSample - fftSize / 2
            let windowEnd = windowStart + fftSize
            // Not enough decoded audio yet; resume once the next chunk has been read.
            guard windowEnd <= chunkEnd else { break }

            let oldestInRing = totalSamplesInRing - ringCapacity
            if max(windowStart, 0) >= oldestInRing {
                // Samples before the start of the file count as silence. Dropping these
                // leading frames instead would shift every later frame earlier in time.
                for j in 0..<fftSize {
                    let absIdx = windowStart + j
                    windowSamples[j] = absIdx < 0 ? 0 : ring[absIdx % ringCapacity]
                }
                rawFrames.append(computeFFTFrame(
                    samples: windowSamples,
                    fftSize: fftSize, halfSize: halfSize,
                    window: window, fftSetup: fftSetup,
                    pointsCount: pointsCount, cutoff: cutoff
                ))
            } else {
                // Defensive: the window has already been overwritten in the ring. Emit a
                // silent frame so frame indices stay aligned with time.
                rawFrames.append([Float](repeating: 0, count: pointsCount))
            }

            nextFrameSample += hopSize
            if rawFrames.count % 100 == 0 {
                progress?(min(1, Float(nextFrameSample) / Float(totalFrames)))
            }
        }
    }
    progress?(1.0)

    // Normalize frames to 0-1 using the 95th-percentile peak, so amplitude is consistent
    // across songs regardless of mastering level.
    let normalized = normalizeFrames(rawFrames)

    // Bake temporal smoothing into the frames so playback isn't jittery.
    let viscosity: Float = 0.25 // matches default VisualizerSettings.viscosity
    let smoothed = smoothFrames(normalized, viscosity: viscosity)

    // LUFS
    var loudnessLUFS: Double? = nil
    if extractSmartDJ && sampleCountLUFS > 0 {
        let meanSquare = sumSquares / Double(sampleCountLUFS)
        if meanSquare > 0 {
            loudnessLUFS = 20.0 * log10(sqrt(meanSquare))
        }
    }

    // Silence guard: discard boundaries that fall outside plausible ranges.
    let safeLeading: Double? = {
        guard let t = leadingSilenceEndSec, t > 0.05, t < durationSec * 0.25 else { return nil }
        return t
    }()
    let safeTrailing: Double? = {
        guard let t = trailingSilenceStartSec,
              t < durationSec - 0.5, t > durationSec * 0.5 else { return nil }
        return t
    }()

    return CombinedResult(
        visFrames: smoothed,
        silenceEnd: safeLeading,
        silenceStart: safeTrailing,
        loudnessLUFS: loudnessLUFS
    )
}
// MARK: - FFT Frame
/// Computes one visualizer frame from a single FFT window of samples.
///
/// The samples are multiplied by `window`, run through a real forward FFT, converted to
/// amplitudes, and reduced to `pointsCount` band averages. Band centres are log-spaced
/// up to `min(halfSize - 1, cutoff)`; bin 0 is never included.
///
/// - Parameters:
///   - samples: Time-domain samples; the first `fftSize` values are used.
///   - fftSize: FFT length in samples.
///   - halfSize: Number of complex bins, expected to be `fftSize / 2`.
///   - window: Window coefficients; the first `fftSize` values are used.
///   - fftSetup: vDSP setup able to perform an FFT of `fftSize` points.
///   - pointsCount: Number of bands to produce.
///   - cutoff: Highest FFT bin index that may contribute to a band.
/// - Returns: `pointsCount` band levels. Empty when `pointsCount <= 0`; all zeros when
///   the cutoff leaves no usable bin above bin 0.
private func computeFFTFrame(
    samples: [Float],
    fftSize: Int,
    halfSize: Int,
    window: [Float],
    fftSetup: FFTSetup,
    pointsCount: Int,
    cutoff: Int
) -> [Float] {
    // A non-positive band count would divide by zero / build an invalid range below.
    guard pointsCount > 0 else { return [] }
    var frame = [Float](repeating: 0, count: pointsCount)

    // Bin 0 is skipped by the binning below, so at least bin 1 must lie inside the cutoff.
    let maxBin = min(halfSize - 1, cutoff)
    guard maxBin >= 1 else { return frame }

    // Apply the window
    var windowed = [Float](repeating: 0, count: fftSize)
    vDSP_vmul(samples, 1, window, 1, &windowed, 1, vDSP_Length(fftSize))

    // FFT
    var realp = [Float](repeating: 0, count: halfSize)
    var imagp = [Float](repeating: 0, count: halfSize)
    var magnitudes = [Float](repeating: 0, count: halfSize)
    let log2n = vDSP_Length(log2(Double(fftSize)))
    realp.withUnsafeMutableBufferPointer { realBuf in
        imagp.withUnsafeMutableBufferPointer { imagBuf in
            guard let realBase = realBuf.baseAddress, let imagBase = imagBuf.baseAddress else { return }
            var split = DSPSplitComplex(realp: realBase, imagp: imagBase)
            // Reinterpret the real signal as interleaved pairs and pack it into split form.
            windowed.withUnsafeBytes { raw in
                guard let interleaved = raw.bindMemory(to: DSPComplex.self).baseAddress else { return }
                vDSP_ctoz(interleaved, 2, &split, 1, vDSP_Length(halfSize))
            }
            vDSP_fft_zrip(fftSetup, &split, 1, log2n, FFTDirection(FFT_FORWARD))
            // Squared magnitude per bin.
            vDSP_zvmags(&split, 1, &magnitudes, 1, vDSP_Length(halfSize))
        }
    }

    // Normalize: divide by N^2, then sqrt to turn power into amplitude.
    var scale = 1.0 / (Float(fftSize) * Float(fftSize))
    vDSP_vsmul(magnitudes, 1, &scale, &magnitudes, 1, vDSP_Length(halfSize))
    for i in magnitudes.indices { magnitudes[i] = magnitudes[i].squareRoot() }

    // Log-spaced band centres with a uniform band width; every band is weighted equally.
    let halfWidth = max(1, maxBin / pointsCount) / 2
    for i in 0..<pointsCount {
        let nIdx = Float(i + 1) / Float(pointsCount)
        let logIdx = log10(nIdx * 9.0 + 1.0) // 0-1, log-spaced
        let centerBin = Int(logIdx * Float(maxBin))
        // Clamp both ends into 1...maxBin. When the centre rounds down to bin 0 and the
        // half-width is 0, the unclamped bounds would be 1...0, which traps.
        let lo = min(maxBin, max(1, centerBin - halfWidth))
        let hi = max(lo, min(maxBin, centerBin + halfWidth))
        let band = magnitudes[lo...hi]
        frame[i] = band.reduce(0, +) / Float(band.count)
    }
    return frame
}
// MARK: - Post-processing
/// Rescales all frames by one common gain so that the 95th-percentile of the positive
/// levels maps to 0.8; anything that ends up above 1.0 is clamped to 1.0.
/// Input that is empty, or contains no positive level at all, is returned unchanged.
private func normalizeFrames(_ frames: [[Float]]) -> [[Float]] {
    // Pool every strictly positive level across all frames, in ascending order.
    let positives = frames.joined().filter { $0 > 0 }.sorted()
    guard let lastIndex = positives.indices.last else { return frames }

    // The reference level is positive by construction, so the gain is well defined.
    let rank = min(Int(Float(positives.count) * 0.95), lastIndex)
    let gain = 0.8 / positives[rank]

    return frames.map { levels in
        levels.map { level in min(1.0, level * gain) }
    }
}
/// Applies exponential smoothing along the time axis and returns the smoothed frames.
/// The first frame is passed through untouched; every later frame moves from the previous
/// smoothed frame towards the incoming one by the fraction `viscosity`. Each output frame
/// has the length of the previous one; positions the incoming frame lacks are left at 0.
private func smoothFrames(_ frames: [[Float]], viscosity: Float) -> [[Float]] {
    guard let seed = frames.first, frames.count > 1 else { return frames }

    var output: [[Float]] = [seed]
    output.reserveCapacity(frames.count)
    var carried = seed

    for incoming in frames.dropFirst() {
        var blended = [Float](repeating: 0, count: carried.count)
        for (slot, (old, target)) in zip(carried, incoming).enumerated() {
            blended[slot] = old + (target - old) * viscosity
        }
        output.append(blended)
        carried = blended
    }
    return output
}
}