// NavidromeApp/iOS/Views/Visualizer/OfflineAudioAnalyzer.swift
//
// 257 lines
// 11 KiB
// Swift

import Foundation
import AVFoundation
import Accelerate
/// Processes an entire audio file faster than real-time, producing per-frame FFT data
/// that can be cached and played back in sync with the audio.
/// Also optionally extracts SmartDJ profile data (silence boundaries + LUFS) in the same pass.
actor OfflineAudioAnalyzer {
static let shared = OfflineAudioAnalyzer()
typealias ProgressCallback = @Sendable (Float) -> Void
// MARK: - Combined Analysis Result
/// Everything produced by one analysis pass over an audio file.
struct CombinedResult {
/// Visualizer frames in the order they were read from the file; each inner
/// array holds the band levels of one frame.
let visFrames: [[Float]]
/// End of the leading silence, in seconds. `nil` when SmartDJ extraction was
/// skipped or the detected value fell outside the plausibility window.
let silenceEnd: Double?
/// Start of the trailing silence, in seconds. `nil` when SmartDJ extraction was
/// skipped or the detected value fell outside the plausibility window.
let silenceStart: Double?
/// Approximate integrated loudness: the mean-square level of the analysed
/// channel expressed in dB (not K-weighted). `nil` when it was not computed.
let loudnessLUFS: Double?
}
// MARK: - Visualizer-only (legacy entry point)
/// Produces only the visualizer frames for the audio file at `url`.
///
/// Forwards every argument to `analyzeWithSmartDJ` with SmartDJ extraction
/// switched off and hands back the `visFrames` of the combined result.
///
/// - Parameters:
///   - url: Audio file to analyse.
///   - pointsCount: Number of band values per visualizer frame.
///   - fps: Target visualizer frame rate.
///   - cutoff: Highest FFT bin index used when building the bands.
///   - eqBoostFactor: Extra gain applied progressively towards the upper bands.
///   - progress: Optional callback receiving the completed fraction.
/// - Returns: One `[Float]` of band levels per visualizer frame.
/// - Throws: Whatever `analyzeWithSmartDJ` throws.
func analyze(
    url: URL,
    pointsCount: Int = 20,
    fps: Double = 30.0,
    cutoff: Int = 90,
    eqBoostFactor: Float = 3.5,
    progress: ProgressCallback? = nil
) throws -> [[Float]] {
    try analyzeWithSmartDJ(
        url: url,
        pointsCount: pointsCount,
        fps: fps,
        cutoff: cutoff,
        eqBoostFactor: eqBoostFactor,
        extractSmartDJ: false,
        progress: progress
    ).visFrames
}
// MARK: - Combined pass: vis frames + SmartDJ profile in one file read
/// Reads the file once, producing visualiser frames AND silence/loudness data.
/// Set `extractSmartDJ: false` to skip the SmartDJ computation and save time.
///
/// Only the first channel of the file's processing format is analysed.
///
/// - Parameters:
///   - url: Audio file to open with `AVAudioFile(forReading:)`.
///   - pointsCount: Number of band values per visualiser frame.
///   - fps: Target visualiser frame rate; must be finite and greater than zero.
///   - cutoff: Highest FFT bin index used when building the bands.
///   - eqBoostFactor: Extra gain applied progressively towards the upper bands.
///   - extractSmartDJ: When `false`, the silence and loudness fields of the result are `nil`.
///   - progress: Called with the completed fraction every 50 visualiser frames,
///     and once with `1.0` when the pass is finished.
/// - Returns: The visualiser frames plus, when requested, the leading/trailing
///   silence boundaries (seconds) and an approximate loudness value.
/// - Throws: Errors raised by `AVAudioFile`, or an `NSError` in the
///   `"OfflineAnalyzer"` domain (1: buffer allocation failed, 2: FFT setup
///   failed, 3: `fps` cannot be turned into a usable hop size).
///
/// - Note: NOTE(review): audio is consumed in chunks of
///   `max(1024, sampleRate / fps)` frames and exactly one visualiser frame is
///   produced per full chunk. A requested `fps` above `sampleRate / 1024`
///   (about 43 at 44.1 kHz) therefore yields fewer frames per second than asked for.
func analyzeWithSmartDJ(
    url: URL,
    pointsCount: Int = 20,
    fps: Double = 30.0,
    cutoff: Int = 90,
    eqBoostFactor: Float = 3.5,
    extractSmartDJ: Bool = true,
    progress: ProgressCallback? = nil
) throws -> CombinedResult {
    let file = try AVAudioFile(forReading: url)
    let format = file.processingFormat
    let sampleRate = format.sampleRate
    let totalFrames = file.length
    let durationSec = Double(totalFrames) / sampleRate

    // Validate the hop size before converting it to an unsigned frame count:
    // a zero, negative, NaN or tiny `fps` would otherwise trap in the conversion.
    let rawHop = sampleRate / fps
    guard fps > 0, rawHop.isFinite, rawHop <= Double(AVAudioFrameCount.max) else {
        throw NSError(domain: "OfflineAnalyzer", code: 3, userInfo: [NSLocalizedDescriptionKey: "Invalid fps"])
    }
    // Never let the hop collapse to zero (fps > sampleRate); a zero hop cannot advance.
    let audioFramesPerVisFrame = max(1, AVAudioFrameCount(rawHop))

    let fftSize = 1024
    let bufferSize = max(AVAudioFrameCount(fftSize), audioFramesPerVisFrame)
    guard let buffer = AVAudioPCMBuffer(pcmFormat: format, frameCapacity: bufferSize) else {
        throw NSError(domain: "OfflineAnalyzer", code: 1, userInfo: [NSLocalizedDescriptionKey: "Failed to create buffer"])
    }
    let log2n = vDSP_Length(log2(Double(fftSize)))
    guard let fftSetup = vDSP_create_fftsetup(log2n, Int32(kFFTRadix2)) else {
        throw NSError(domain: "OfflineAnalyzer", code: 2, userInfo: [NSLocalizedDescriptionKey: "Failed to create FFT setup"])
    }
    defer { vDSP_destroy_fftsetup(fftSetup) }
    let halfSize = fftSize / 2

    // One frame is emitted per chunk, so the chunk count is the capacity we need.
    var visualizerData: [[Float]] = []
    let estimatedVisFrames = max(0, Int(totalFrames / AVAudioFramePosition(bufferSize)) + 1)
    visualizerData.reserveCapacity(estimatedVisFrames)

    var window = [Float](repeating: 0, count: fftSize)
    vDSP_hann_window(&window, vDSP_Length(fftSize), Int32(vDSP_HANN_NORM))

    // SmartDJ state
    let silenceThreshold: Float = 0.008          // RMS below this = silence
    var leadingSilenceEndSec: Double? = nil      // start of the first non-silent chunk
    var trailingSilenceStartSec: Double? = nil   // end of the last non-silent chunk
    var sumSquares: Double = 0.0
    var sampleCount: Int64 = 0
    var filePositionSec: Double { Double(file.framePosition) / sampleRate }

    var frameIndex = 0
    while file.framePosition < totalFrames {
        // Clamp in the 64-bit domain first: the remaining frame count of a very
        // long file does not fit in AVAudioFrameCount (UInt32).
        let remainingFrames = totalFrames - file.framePosition
        let framesToRead = AVAudioFrameCount(min(AVAudioFramePosition(bufferSize), remainingFrames))
        buffer.frameLength = 0
        try file.read(into: buffer, frameCount: framesToRead)
        guard let channelData = buffer.floatChannelData?[0] else { continue }
        let actualFrames = Int(buffer.frameLength)
        // The file position has already advanced past this chunk; step back to its start.
        let chunkStartSec = filePositionSec - Double(actualFrames) / sampleRate

        // SmartDJ: RMS per chunk
        if extractSmartDJ && actualFrames > 0 {
            var rms: Float = 0
            vDSP_rmsqv(channelData, 1, &rms, vDSP_Length(actualFrames))
            if rms > silenceThreshold {
                if leadingSilenceEndSec == nil {
                    leadingSilenceEndSec = chunkStartSec
                }
                trailingSilenceStartSec = chunkStartSec + Double(actualFrames) / sampleRate
            }
            // Accumulate for integrated loudness: vDSP_measqv yields the mean of
            // squares, so scale by the chunk length to get this chunk's sum.
            var sumSq: Float = 0
            vDSP_measqv(channelData, 1, &sumSq, vDSP_Length(actualFrames))
            sumSquares += Double(sumSq) * Double(actualFrames)
            sampleCount += Int64(actualFrames)
        }

        // Visualiser FFT frames
        guard actualFrames >= fftSize else {
            // Short (final) chunk: emit one zero-padded frame, then stop.
            if actualFrames > 0 {
                visualizerData.append(processFFTFrame(
                    channelData: channelData, frameCount: actualFrames,
                    fftSize: fftSize, halfSize: halfSize, window: window,
                    fftSetup: fftSetup, pointsCount: pointsCount,
                    cutoff: cutoff, eqBoostFactor: eqBoostFactor))
            }
            break
        }
        var sampleOffset = 0
        while sampleOffset + fftSize <= actualFrames {
            visualizerData.append(processFFTFrame(
                channelData: channelData.advanced(by: sampleOffset),
                frameCount: fftSize, fftSize: fftSize, halfSize: halfSize,
                window: window, fftSetup: fftSetup, pointsCount: pointsCount,
                cutoff: cutoff, eqBoostFactor: eqBoostFactor))
            sampleOffset += Int(audioFramesPerVisFrame)
            frameIndex += 1
            if frameIndex % 50 == 0 {
                progress?(Float(file.framePosition) / Float(totalFrames))
            }
        }
    }
    progress?(1.0)

    // Compute approximate integrated LUFS.
    // Uses the mean-square level in dBFS as a simplified approximation of BS.1770.
    // Not true K-weighted LUFS but accurate enough for volume normalisation.
    var loudnessLUFS: Double? = nil
    if extractSmartDJ && sampleCount > 0 {
        let meanSquare = sumSquares / Double(sampleCount)
        if meanSquare > 0 {
            let lufs = 20.0 * log10(sqrt(meanSquare))
            loudnessLUFS = lufs
        }
    }

    // Guard silence detections: must be within plausible range.
    // Leading silence: longer than 50 ms and inside the first quarter of the track.
    let safeLeading: Double? = {
        guard let t = leadingSilenceEndSec, t > 0.05, t < durationSec * 0.25 else { return nil }
        return t
    }()
    // Trailing silence: at least 0.5 s before the end and inside the second half.
    let safeTrailing: Double? = {
        guard let t = trailingSilenceStartSec, t < durationSec - 0.5, t > durationSec * 0.5 else { return nil }
        return t
    }()

    return CombinedResult(
        visFrames: visualizerData,
        silenceEnd: safeLeading,
        silenceStart: safeTrailing,
        loudnessLUFS: loudnessLUFS
    )
}
/// Process a single FFT frame from raw audio samples.
///
/// The samples are multiplied by `window` (zero-padded when fewer than
/// `fftSize` are available), transformed with a real FFT, converted to
/// normalised amplitudes and then averaged into `pointsCount` logarithmically
/// spaced bands. Bin 0 is never included in a band.
///
/// - Parameters:
///   - channelData: Pointer to at least `frameCount` readable samples.
///   - frameCount: Number of valid samples at `channelData`; values above
///     `fftSize` are truncated, values below are zero-padded.
///   - fftSize: Transform length; assumed to be the power of two `fftSetup` was created for.
///   - halfSize: Number of complex bins; the caller passes `fftSize / 2`.
///   - window: Window coefficients with at least `fftSize` entries.
///   - fftSetup: vDSP FFT setup to use for the transform.
///   - pointsCount: Number of bands to return; non-positive values yield an empty array.
///   - cutoff: Highest bin index that may contribute to a band (clamped to `halfSize - 1`).
///   - eqBoostFactor: Band `i` is multiplied by `1 + (i / pointsCount) * eqBoostFactor`.
/// - Returns: `pointsCount` band levels; all zeros when no bin at index 1 or above is usable.
private func processFFTFrame(
    channelData: UnsafePointer<Float>,
    frameCount: Int,
    fftSize: Int,
    halfSize: Int,
    window: [Float],
    fftSetup: FFTSetup,
    pointsCount: Int,
    cutoff: Int,
    eqBoostFactor: Float
) -> [Float] {
    // A negative count would trap when allocating the result; zero has nothing to compute.
    guard pointsCount > 0 else { return [] }

    // Clamp to 0...fftSize so a non-positive frameCount simply means "all padding".
    let n = max(0, min(frameCount, fftSize))

    // 1. Apply Hann window. Only the first `n` samples are written; the rest of
    //    `windowed` stays zero, which is the zero padding for short frames.
    var windowed = [Float](repeating: 0, count: fftSize)
    if n > 0 {
        vDSP_vmul(channelData, 1, window, 1, &windowed, 1, vDSP_Length(n))
    }

    // 2. FFT
    var realp = [Float](repeating: 0, count: halfSize)
    var imagp = [Float](repeating: 0, count: halfSize)
    var magnitudes = [Float](repeating: 0, count: halfSize)
    realp.withUnsafeMutableBufferPointer { realpBuf in
        imagp.withUnsafeMutableBufferPointer { imagpBuf in
            var splitComplex = DSPSplitComplex(
                realp: realpBuf.baseAddress!,
                imagp: imagpBuf.baseAddress!
            )
            // Reinterpret the real samples as interleaved pairs and split them
            // into the even/odd layout the in-place real FFT expects.
            windowed.withUnsafeBytes { raw in
                let ptr = raw.bindMemory(to: DSPComplex.self).baseAddress!
                vDSP_ctoz(ptr, 2, &splitComplex, 1, vDSP_Length(halfSize))
            }
            vDSP_fft_zrip(fftSetup, &splitComplex, 1, vDSP_Length(log2(Double(fftSize))), FFTDirection(FFT_FORWARD))
            // Squared magnitude of every bin.
            vDSP_zvmags(&splitComplex, 1, &magnitudes, 1, vDSP_Length(halfSize))
        }
    }

    // 3. Normalize: scale the squared magnitudes by 1 / fftSize^2 ...
    let fftSizeF = Float(fftSize)
    var scale: Float = 1.0 / (fftSizeF * fftSizeF)
    vDSP_vsmul(magnitudes, 1, &scale, &magnitudes, 1, vDSP_Length(halfSize))
    // ... then take the square root to get amplitudes.
    for i in 0..<halfSize {
        magnitudes[i] = sqrt(magnitudes[i])
    }

    // 4. Logarithmic binning with EQ boost
    var framePoints = [Float](repeating: 0, count: pointsCount)
    let maxUsefulBin = min(halfSize - 1, cutoff)
    // Bands start at bin 1, so without at least one bin above DC every band is silent.
    guard maxUsefulBin >= 1 else { return framePoints }

    for i in 0..<pointsCount {
        let normalizedIndex = Float(i + 1) / Float(pointsCount)
        let logIndex = log10(normalizedIndex * 9.0 + 1.0)
        let centerBin = logIndex * Float(maxUsefulBin)
        let binWidth = max(1.0, Float(maxUsefulBin) / Float(pointsCount) * logIndex)
        let startBin = max(1, Int(centerBin - binWidth / 2))
        // For the lowest bands the upper edge can truncate to bin 0, below the
        // forced lower edge of 1; clamp it so the closed range is never inverted.
        let endBin = min(maxUsefulBin, max(startBin, Int(centerBin + binWidth / 2)))
        guard startBin <= endBin else { continue }

        var sum: Float = 0
        var countInBand = 0
        for j in startBin...endBin where j < magnitudes.count {
            sum += magnitudes[j]
            countInBand += 1
        }
        let average = countInBand > 0 ? (sum / Float(countInBand)) : 0
        let eqBoost: Float = 1.0 + (Float(i) / Float(pointsCount)) * eqBoostFactor
        framePoints[i] = average * eqBoost
    }
    return framePoints
}
}