335 lines
14 KiB
Swift
335 lines
14 KiB
Swift
import Foundation
|
||
import AVFoundation
|
||
import Accelerate
|
||
|
||
/// Processes an entire audio file faster than real-time, producing per-frame level data
/// that can be cached and played back in sync with the audio.
/// Also optionally extracts SmartDJ profile data (silence boundaries + loudness) in the same pass.
///
/// Only the first channel of the file's processing format is analysed.
/// All analysis methods are synchronous and run on the actor, so concurrent
/// requests are serialised.
actor OfflineAudioAnalyzer {
    static let shared = OfflineAudioAnalyzer()

    /// Receives analysis progress in the range `0...1`.
    typealias ProgressCallback = @Sendable (Float) -> Void

    /// Error domain used for every `NSError` thrown by this type.
    private static let errorDomain = "OfflineAnalyzer"

    /// Upper bound on the number of samples pulled from disk per read, so a very
    /// small `fps` cannot request an arbitrarily large buffer.
    private static let maxReadChunkSamples = 1 << 20

    /// Builds an `NSError` in this type's error domain.
    private static func makeError(code: Int, _ message: String) -> NSError {
        NSError(domain: errorDomain, code: code,
                userInfo: [NSLocalizedDescriptionKey: message])
    }

    // MARK: - Combined Analysis Result

    /// Output of a combined analysis pass.
    struct CombinedResult {
        /// Normalised, smoothed level frames; frame `k` is centred on time `k / fps`.
        let visFrames: [[Float]]
        /// Leading silence end in seconds, or `nil` if none was detected / it was implausible.
        let silenceEnd: Double?
        /// Trailing silence start in seconds, or `nil` if none was detected / it was implausible.
        let silenceStart: Double?
        /// Approximate integrated loudness: `20 * log10(RMS)` of the analysed channel,
        /// with no frequency weighting or gating applied.
        let loudnessLUFS: Double?
    }

    // MARK: - Visualizer-only (legacy entry point)

    /// Analyses `url` and returns only the visualizer frames.
    ///
    /// - Parameters:
    ///   - url: Audio file to read.
    ///   - pointsCount: Number of bands per frame. Must be greater than zero.
    ///   - fps: Frames per second of the produced data. Must be finite and positive.
    ///   - cutoff: Highest FFT bin included in the bands. Must be greater than zero.
    ///   - progress: Optional progress callback (`0...1`).
    /// - Returns: One `[Float]` of `pointsCount` levels per frame.
    /// - Throws: Errors from `AVAudioFile`, `CancellationError` when the task is
    ///   cancelled, or an `NSError` in the `"OfflineAnalyzer"` domain.
    func analyze(
        url: URL,
        pointsCount: Int = 20,
        fps: Double = 30.0,
        cutoff: Int = 90,
        progress: ProgressCallback? = nil
    ) throws -> [[Float]] {
        let result = try analyzeWithSmartDJ(
            url: url, pointsCount: pointsCount, fps: fps,
            cutoff: cutoff, extractSmartDJ: false, progress: progress
        )
        return result.visFrames
    }

    // MARK: - Combined pass: vis frames + SmartDJ profile in one file read

    /// Analyses `url` in a single pass, producing visualizer frames and, when
    /// `extractSmartDJ` is true, silence boundaries and a loudness estimate.
    ///
    /// Frame `k` is an FFT window of 1024 samples centred on sample
    /// `round(k * sampleRate / fps)`. Samples before the start or after the end of
    /// the file are treated as zero, so no frame is skipped and frame indices stay
    /// aligned with playback time.
    ///
    /// - Parameters:
    ///   - url: Audio file to read.
    ///   - pointsCount: Number of bands per frame. Must be greater than zero.
    ///   - fps: Frames per second of the produced data. Must be finite and positive.
    ///   - cutoff: Highest FFT bin included in the bands. Must be greater than zero.
    ///   - extractSmartDJ: When false, the silence/loudness fields of the result are `nil`.
    ///   - progress: Optional progress callback (`0...1`); always called with `1.0` at the end.
    /// - Returns: The combined analysis result.
    /// - Throws: Errors from `AVAudioFile`, `CancellationError` when the task is
    ///   cancelled, or an `NSError` in the `"OfflineAnalyzer"` domain
    ///   (1: buffer allocation, 2: FFT setup, 3: invalid parameter, 4: no float data).
    func analyzeWithSmartDJ(
        url: URL,
        pointsCount: Int = 20,
        fps: Double = 30.0,
        cutoff: Int = 90,
        extractSmartDJ: Bool = true,
        progress: ProgressCallback? = nil
    ) throws -> CombinedResult {
        // Reject parameters that would otherwise trap further down
        // (division by zero, Int(Double.infinity), empty closed ranges).
        guard pointsCount > 0 else {
            throw Self.makeError(code: 3, "pointsCount must be greater than zero")
        }
        guard cutoff > 0 else {
            throw Self.makeError(code: 3, "cutoff must be greater than zero")
        }
        guard fps.isFinite, fps > 0 else {
            throw Self.makeError(code: 3, "fps must be a finite, positive number")
        }

        let file = try AVAudioFile(forReading: url)
        let format = file.processingFormat
        let sampleRate = format.sampleRate
        guard sampleRate.isFinite, sampleRate > 0 else {
            throw Self.makeError(code: 3, "Audio file reports an invalid sample rate")
        }
        let totalFrames = file.length
        let durationSec = Double(totalFrames) / sampleRate

        // FFT parameters — always 1024 regardless of fps
        let fftSize = 1024
        let halfSize = fftSize / 2
        let log2n = vDSP_Length(log2(Double(fftSize)))

        guard let fftSetup = vDSP_create_fftsetup(log2n, Int32(kFFTRadix2)) else {
            throw Self.makeError(code: 2, "Failed to create FFT setup")
        }
        defer { vDSP_destroy_fftsetup(fftSetup) }

        // Hann window
        var window = [Float](repeating: 0, count: fftSize)
        vDSP_hann_window(&window, vDSP_Length(fftSize), Int32(vDSP_HANN_NORM))

        // Distance in audio samples between consecutive vis frames. Kept as a
        // Double so frame positions do not drift when sampleRate / fps is not an
        // integer. Independent of fftSize: windows overlap when this is smaller
        // than fftSize and leave gaps when it is larger.
        let samplesPerFrame = max(1.0, sampleRate / fps)

        // Read in chunks large enough to hold at least one full FFT window.
        // Using 4× the hop so we get several vis frames per disk read.
        let hopSize = Int(min(samplesPerFrame, Double(Self.maxReadChunkSamples / 4)))
        let readChunkSamples = max(fftSize * 2, hopSize * 4)
        guard let readBuffer = AVAudioPCMBuffer(
            pcmFormat: format,
            frameCapacity: AVAudioFrameCount(readChunkSamples)
        ) else {
            throw Self.makeError(code: 1, "Failed to create buffer")
        }

        // Ring buffer: holds the last `fftSize` samples plus one chunk.
        // The sample with absolute index `s` always lives at `s % ringCapacity`.
        let ringCapacity = readChunkSamples + fftSize
        var ring = [Float](repeating: 0, count: ringCapacity)
        var samplesRead = 0 // total samples written so far == absolute index of the next sample

        // spf >= 1, so this is bounded by totalFrames + 1 and cannot overflow.
        let estimatedFrames = Int((Double(max(totalFrames, 0)) / samplesPerFrame).rounded(.up)) + 1
        var rawFrames: [[Float]] = []
        rawFrames.reserveCapacity(estimatedFrames)

        // Scratch buffer for one FFT window, reused for every frame.
        var windowSamples = [Float](repeating: 0, count: fftSize)

        // SmartDJ accumulators
        let silenceThreshold: Float = 0.008
        var leadingSilenceEndSec: Double? = nil
        var trailingSilenceStartSec: Double? = nil
        var sumSquares: Double = 0
        var sampleCountLUFS: Int64 = 0

        var reachedEnd = totalFrames <= 0
        while !reachedEnd {
            // Cooperatively cancel if the app backgrounded mid-analysis
            try Task.checkCancellation()

            // Take the minimum in Int64 first: converting a large remaining frame
            // count straight to AVAudioFrameCount (UInt32) would trap.
            let remaining = totalFrames - file.framePosition
            let toRead = AVAudioFrameCount(min(Int64(readChunkSamples), remaining))
            readBuffer.frameLength = 0
            try file.read(into: readBuffer, frameCount: toRead)

            let chunkStart = samplesRead
            let chunkLen = Int(readBuffer.frameLength)
            // A zero-length read means no more data is coming even if the reported
            // length has not been reached; stop instead of spinning forever.
            reachedEnd = chunkLen == 0 || file.framePosition >= totalFrames

            if chunkLen > 0 {
                guard let channels = readBuffer.floatChannelData else {
                    throw Self.makeError(code: 4, "Audio buffer has no float channel data")
                }
                let ch = channels[0]

                // Write chunk into ring buffer
                for i in 0..<chunkLen {
                    ring[(chunkStart + i) % ringCapacity] = ch[i]
                }
                samplesRead += chunkLen

                // SmartDJ RMS & loudness over this chunk
                if extractSmartDJ {
                    var rms: Float = 0
                    vDSP_rmsqv(ch, 1, &rms, vDSP_Length(chunkLen))
                    let chunkSec = Double(chunkStart) / sampleRate
                    if rms > silenceThreshold {
                        // First audible chunk marks the end of leading silence;
                        // the end of the last audible chunk marks the start of trailing silence.
                        if leadingSilenceEndSec == nil { leadingSilenceEndSec = chunkSec }
                        trailingSilenceStartSec = chunkSec + Double(chunkLen) / sampleRate
                    }
                    var meanSquare: Float = 0
                    vDSP_measqv(ch, 1, &meanSquare, vDSP_Length(chunkLen))
                    sumSquares += Double(meanSquare) * Double(chunkLen)
                    sampleCountLUFS += Int64(chunkLen)
                }
            }

            // Generate every vis frame whose window is now fully available
            // (or, at end of file, every remaining frame with a zero-padded tail).
            while true {
                let centre = (Double(rawFrames.count) * samplesPerFrame).rounded()
                guard centre < Double(samplesRead) else { break }
                let centreSample = Int(centre)

                // Centre the FFT window on the frame position for better transient response.
                let windowStart = centreSample - halfSize
                let windowEnd = windowStart + fftSize

                // Not enough samples yet: wait for the next chunk unless the file is exhausted.
                if windowEnd > samplesRead && !reachedEnd { break }

                // Check cancellation every frame — the inner loop is the hot path
                try Task.checkCancellation()

                // Copy the window out of the ring, treating samples outside the
                // file as silence. A frame is emitted as soon as its window is
                // complete, so windowStart never falls behind the ring's tail.
                for j in 0..<fftSize {
                    let sampleIndex = windowStart + j
                    if sampleIndex >= 0 && sampleIndex < samplesRead {
                        windowSamples[j] = ring[sampleIndex % ringCapacity]
                    } else {
                        windowSamples[j] = 0
                    }
                }

                let frame = computeFFTFrame(
                    samples: windowSamples,
                    fftSize: fftSize, halfSize: halfSize,
                    window: window, fftSetup: fftSetup,
                    pointsCount: pointsCount, cutoff: cutoff
                )
                rawFrames.append(frame)

                if rawFrames.count % 100 == 0 {
                    progress?(min(1.0, Float(centreSample) / Float(totalFrames)))
                }
            }
        }
        progress?(1.0)

        // ── Normalize frames to 0–1 using 95th-percentile peak ──────────────
        // This ensures consistent amplitude across songs regardless of mastering
        // level, and makes pre-analyzed playback feel identical to the live FFT
        // path which also normalises against a peak follower.
        let normalized = normalizeFrames(rawFrames)

        // ── Apply per-frame temporal smoothing ───────────────────────────────
        // The live path smooths in updateDisplayLevels. Pre-analyzed frames need
        // the same treatment baked in so playback isn't jittery.
        let viscosity: Float = 0.25 // matches default VisualizerSettings.viscosity
        let smoothed = smoothFrames(normalized, viscosity: viscosity)

        // ── LUFS ─────────────────────────────────────────────────────────────
        var loudnessLUFS: Double? = nil
        if extractSmartDJ && sampleCountLUFS > 0 {
            let meanSquare = sumSquares / Double(sampleCountLUFS)
            if meanSquare > 0 {
                loudnessLUFS = 20.0 * log10(sqrt(meanSquare))
            }
        }

        // ── Silence guard ────────────────────────────────────────────────────
        // Discard boundaries that are negligible or implausibly far into the track.
        let safeLeading: Double? = {
            guard let t = leadingSilenceEndSec, t > 0.05, t < durationSec * 0.25 else { return nil }
            return t
        }()
        let safeTrailing: Double? = {
            guard let t = trailingSilenceStartSec,
                  t < durationSec - 0.5, t > durationSec * 0.5 else { return nil }
            return t
        }()

        return CombinedResult(
            visFrames: smoothed,
            silenceEnd: safeLeading,
            silenceStart: safeTrailing,
            loudnessLUFS: loudnessLUFS
        )
    }

    // MARK: - FFT Frame

    /// Computes one frame of band levels from a window of samples.
    ///
    /// - Parameters:
    ///   - samples: `fftSize` time-domain samples.
    ///   - fftSize: FFT length (a power of two).
    ///   - halfSize: `fftSize / 2`.
    ///   - window: `fftSize` window coefficients multiplied into `samples`.
    ///   - fftSetup: vDSP setup created for at least `log2(fftSize)`.
    ///   - pointsCount: Number of bands to produce.
    ///   - cutoff: Highest FFT bin considered (clamped to `1...halfSize - 1`).
    /// - Returns: `pointsCount` averaged magnitudes, or an empty array when
    ///   `pointsCount` is not positive.
    private func computeFFTFrame(
        samples: [Float],
        fftSize: Int,
        halfSize: Int,
        window: [Float],
        fftSetup: FFTSetup,
        pointsCount: Int,
        cutoff: Int
    ) -> [Float] {
        guard pointsCount > 0 else { return [] }

        // Apply Hann window
        var windowed = [Float](repeating: 0, count: fftSize)
        vDSP_vmul(samples, 1, window, 1, &windowed, 1, vDSP_Length(fftSize))

        // FFT
        var realp = [Float](repeating: 0, count: halfSize)
        var imagp = [Float](repeating: 0, count: halfSize)
        var magnitudes = [Float](repeating: 0, count: halfSize)
        let log2n = vDSP_Length(log2(Double(fftSize)))

        realp.withUnsafeMutableBufferPointer { rb in
            imagp.withUnsafeMutableBufferPointer { ib in
                var sc = DSPSplitComplex(realp: rb.baseAddress!, imagp: ib.baseAddress!)
                // Repack the real signal as interleaved complex pairs into split-complex form.
                windowed.withUnsafeBytes { raw in
                    vDSP_ctoz(raw.bindMemory(to: DSPComplex.self).baseAddress!,
                              2, &sc, 1, vDSP_Length(halfSize))
                }
                vDSP_fft_zrip(fftSetup, &sc, 1, log2n, FFTDirection(FFT_FORWARD))
                // Squared magnitudes per bin.
                vDSP_zvmags(&sc, 1, &magnitudes, 1, vDSP_Length(halfSize))
            }
        }

        // Normalize: divide by N², then sqrt for perceptual amplitude
        let scale = 1.0 / (Float(fftSize) * Float(fftSize))
        for i in magnitudes.indices {
            magnitudes[i] = (magnitudes[i] * scale).squareRoot()
        }

        // Log-spaced band centres with a uniform bin width, matching the fixed live-path binning.
        // No eqBoost: frequency bands are weighted equally, matching the original Mitsuha behaviour.
        var frame = [Float](repeating: 0, count: pointsCount)
        let maxBin = max(1, min(halfSize - 1, cutoff))
        let halfWidth = max(1, maxBin / pointsCount) / 2

        for i in 0..<pointsCount {
            let nIdx = Float(i + 1) / Float(pointsCount)
            let logIdx = log10(nIdx * 9.0 + 1.0) // 0→1 log-spaced
            let centerBin = Int(logIdx * Float(maxBin))
            // Bin 0 is never included. Clamp so the range is never empty: with
            // many bands the lowest centres round down to bin 0, which would
            // otherwise yield lo > hi and trap on `lo...hi`.
            let lo = min(maxBin, max(1, centerBin - halfWidth))
            let hi = max(lo, min(maxBin, centerBin + halfWidth))

            var sum: Float = 0
            var count = 0
            for j in lo...hi where j < magnitudes.count {
                sum += magnitudes[j]
                count += 1
            }
            frame[i] = count > 0 ? sum / Float(count) : 0
        }
        return frame
    }

    // MARK: - Post-processing

    /// Normalize all frames so the 95th-percentile peak maps to 0.8.
    /// This keeps loud transients visible without clipping, and ensures a quiet
    /// song fills the visualizer at the same apparent height as a loud one.
    ///
    /// - Parameter frames: Raw band levels.
    /// - Returns: Frames scaled by `0.8 / p95` and capped at `1.0`; the input is
    ///   returned unchanged when it contains no positive value.
    private func normalizeFrames(_ frames: [[Float]]) -> [[Float]] {
        guard !frames.isEmpty else { return frames }

        // Collect all non-zero values to find the 95th percentile
        var allValues: [Float] = []
        allValues.reserveCapacity(frames.count * (frames.first?.count ?? 1))
        for frame in frames {
            for v in frame where v > 0 { allValues.append(v) }
        }
        guard !allValues.isEmpty else { return frames }

        allValues.sort()
        let p95idx = min(Int(Float(allValues.count) * 0.95), allValues.count - 1)
        let p95 = allValues[p95idx]
        guard p95 > 0 else { return frames }

        let scale = 0.8 / p95
        return frames.map { frame in frame.map { min(1.0, $0 * scale) } }
    }

    /// Bake temporal smoothing into the frames so pre-analyzed playback
    /// looks identical to the live FFT path (which smooths in updateDisplayLevels).
    ///
    /// Each frame after the first moves from the previous smoothed frame towards
    /// the new frame by a factor of `viscosity`; the first frame is left untouched.
    ///
    /// - Parameters:
    ///   - frames: Normalised band levels.
    ///   - viscosity: Fraction of the distance to the new value covered per frame.
    /// - Returns: The smoothed frames, same count as `frames`.
    private func smoothFrames(_ frames: [[Float]], viscosity: Float) -> [[Float]] {
        guard frames.count > 1 else { return frames }
        var result = frames
        var prev = frames[0]
        for i in 1..<frames.count {
            let current = frames[i]
            var smoothed = [Float](repeating: 0, count: prev.count)
            let n = min(prev.count, current.count)
            for j in 0..<n {
                smoothed[j] = prev[j] + (current[j] - prev[j]) * viscosity
            }
            result[i] = smoothed
            prev = smoothed
        }
        return result
    }
}
|