// NavidromeApp/iOS/Data/AudioTapProcessor.swift
//
// 422 lines
// 16 KiB
// Swift

import Foundation
import AVFoundation
import MediaToolbox
import Accelerate
// MARK: - Lock-free Ring Buffer for audio samples
/// Fixed-capacity circular buffer of mono `Float` samples that always holds the
/// most recently written audio.
///
/// Intended for a single producer (the audio tap callback calling `write`) and a
/// single consumer (the FFT reader calling `readMostRecent`). Neither path takes a
/// lock or allocates memory.
///
/// NOTE(review): `_writeIndex` is a plain `Int`, not an atomic. The design relies on
/// aligned word-sized stores not tearing on the target CPU, and a reader may observe
/// a window that is being overwritten while it copies. That is acceptable for a
/// visualizer but should be confirmed before using the data for anything stricter.
final class PCMRingBuffer {
    /// Number of samples the buffer can hold.
    let capacity: Int
    private let buffer: UnsafeMutablePointer<Float>
    /// Slot the next written sample will occupy; always in `0..<capacity`.
    private var _writeIndex: Int = 0

    /// Allocates zero-filled storage for `capacity` samples.
    /// - Parameter capacity: Must be positive; every index computation divides by it.
    init(capacity: Int) {
        precondition(capacity > 0, "PCMRingBuffer capacity must be positive")
        self.capacity = capacity
        buffer = .allocate(capacity: capacity)
        buffer.initialize(repeating: 0, count: capacity)
    }

    deinit {
        buffer.deallocate()
    }

    /// Appends `count` samples, overwriting the oldest data once the buffer is full.
    ///
    /// - Parameters:
    ///   - samples: Source samples; must be valid for `count` elements.
    ///   - count: Number of samples to append. Values `<= 0` are ignored.
    func write(_ samples: UnsafePointer<Float>, count: Int) {
        guard count > 0 else { return }

        // Only the newest `capacity` samples can survive a write, so anything older
        // is skipped while the write position still advances past it.
        let skipped = max(0, count - capacity)
        let kept = count - skipped
        let source = samples + skipped
        let start = (_writeIndex + skipped) % capacity

        // Copy in at most two contiguous runs: up to the end of storage, then the
        // wrapped remainder from the start of storage.
        let firstRun = min(kept, capacity - start)
        (buffer + start).initialize(from: source, count: firstRun)
        if kept > firstRun {
            buffer.initialize(from: source + firstRun, count: kept - firstRun)
        }

        _writeIndex = (start + kept) % capacity
    }

    /// Copies the newest samples into `dest`, oldest first.
    ///
    /// - Parameters:
    ///   - dest: Destination; must have room for `min(count, capacity)` samples.
    ///   - count: Number of samples requested. Requests larger than `capacity` are
    ///     clamped, because the buffer never holds more than `capacity` samples.
    /// - Returns: The number of samples actually copied (`0` when `count <= 0`).
    @discardableResult
    func readMostRecent(into dest: UnsafeMutablePointer<Float>, count: Int) -> Int {
        let copied = min(count, capacity)
        guard copied > 0 else { return 0 }

        // `copied <= capacity` keeps the dividend non-negative; Swift's `%` preserves
        // the sign, so an unclamped request would index before the allocation.
        let start = (_writeIndex - copied + capacity) % capacity

        let firstRun = min(copied, capacity - start)
        dest.initialize(from: UnsafePointer(buffer + start), count: firstRun)
        if copied > firstRun {
            (dest + firstRun).initialize(from: UnsafePointer(buffer), count: copied - firstRun)
        }
        return copied
    }

    /// Rewinds the write position and zero-fills the storage.
    func reset() {
        _writeIndex = 0
        buffer.initialize(repeating: 0, count: capacity)
    }
}
// MARK: - Audio Tap Processor
/// Installs an MTAudioProcessingTap on an AVPlayerItem and makes raw PCM samples
/// available for FFT visualization and Shazam recognition.
///
/// Architecture:
///   AVPlayerItem → MTAudioProcessingTap (C callbacks `tapInit` / `tapPrepare` / `tapProcess`)
///     → PCMRingBuffer (lock-free)
///     → caller polls `computeFFTBands` (intended: 30fps timer) → normalized bands
///     → optional: Shazam consumer subscribes via `shazamHandler`
///
/// Thread safety:
/// - `installTap` is `@MainActor`; `computeFFTBands` is intended for the main thread
///   and reuses scratch buffers, so it must not be called concurrently with itself.
/// - The tap's process callback writes the ring buffer without locks or allocation.
/// - The debug-dump path (`debugWriteSamples`) DOES allocate and perform file I/O from
///   the tap callback. It is a diagnostic aid only and is not synchronized with
///   `stopDebugDump`; the throwing `FileHandle` APIs are used so that a write racing a
///   close surfaces as an error rather than an Objective-C exception.
/// - `shazamHandler`, `sourceFormat` and the debug flags are plain properties shared
///   between threads without synchronization.
final class AudioTapProcessor {
    static let shared = AudioTapProcessor()

    /// Ring buffer: 8192 samples ≈ 186ms at 44.1kHz, plenty for 1024-sample FFT windows.
    let ringBuffer = PCMRingBuffer(capacity: 8192)

    /// Shazam consumer, invoked from the tap's process callback with the buffer list
    /// and frame count of each rendered slice. Set to `nil` to unsubscribe.
    var shazamHandler: ((UnsafeMutablePointer<AudioBufferList>, CMItemCount) -> Void)?

    // Pre-allocated FFT resources: created once, reused on every `computeFFTBands` call.
    private let fftSize = 1024
    private let fftLog2n: vDSP_Length
    private let fftSetup: FFTSetup
    private var hannWindow: [Float]
    private var fftTimeDomain: [Float]
    private var fftRealp: [Float]
    private var fftImagp: [Float]
    private var fftMagnitudes: [Float]

    // Debug: save tapped PCM to a WAV file for verification.
    /// When `true` at `installTap` time, a capture is started automatically.
    var debugDumpEnabled = false
    /// Location of the most recently started capture file, if any.
    var debugDumpURL: URL?
    private var debugFileHandle: FileHandle?
    private var debugSamplesWritten: Int = 0
    /// Sample budget of the running capture (`debugCaptureSeconds` at `debugSampleRate`).
    private var debugMaxSamplesActual = 44100 * 5
    /// Sample rate stamped into the WAV header of the running capture.
    private var debugSampleRate: UInt32 = 44100
    private static let debugCaptureSeconds = 5
    /// Rate assumed while the tap has not reported its processing format yet.
    private static let fallbackSampleRate: UInt32 = 44100

    /// Posted on main thread when debug capture completes. userInfo contains "url": URL.
    static let captureCompleteNotification = Notification.Name("AudioTapCaptureComplete")

    /// Source format reported by the tap's prepare callback; `nil` while no tap is prepared.
    var sourceFormat: AVAudioFormat? {
        didSet {
            // A capture armed from `installTap` starts before the tap is prepared, so it
            // was sized for the fallback rate. Adopt the real rate as soon as it is
            // known, provided no samples were written under the old assumption.
            guard debugFileHandle != nil, debugSamplesWritten == 0,
                  let rate = AudioTapProcessor.integralSampleRate(of: sourceFormat) else { return }
            debugSampleRate = rate
            debugMaxSamplesActual = Int(rate) * AudioTapProcessor.debugCaptureSeconds
        }
    }

    private init() {
        let halfSize = fftSize / 2
        fftLog2n = vDSP_Length(log2(Float(fftSize)))
        guard let setup = vDSP_create_fftsetup(fftLog2n, FFTRadix(kFFTRadix2)) else {
            fatalError("[AudioTap] vDSP_create_fftsetup failed")
        }
        fftSetup = setup
        hannWindow = [Float](repeating: 0, count: fftSize)
        vDSP_hann_window(&hannWindow, vDSP_Length(fftSize), Int32(vDSP_HANN_NORM))
        fftTimeDomain = [Float](repeating: 0, count: fftSize)
        fftRealp = [Float](repeating: 0, count: halfSize)
        fftImagp = [Float](repeating: 0, count: halfSize)
        fftMagnitudes = [Float](repeating: 0, count: halfSize)
    }

    deinit {
        vDSP_destroy_fftsetup(fftSetup)
    }

    // MARK: - Install / Remove Tap

    /// Installs the shared tap on `playerItem` by assigning it a new audio mix.
    ///
    /// Any audio mix already on `playerItem` is removed first, the ring buffer is
    /// reset, and a running debug capture is finalized. If `debugDumpEnabled` is
    /// still set afterwards, a new capture is started.
    ///
    /// - Parameter playerItem: Item whose first audio track should be tapped.
    /// - Returns: `true` when the tap was created and attached; `false` when the
    ///   tracks could not be loaded, no audio track exists, or tap creation failed.
    @MainActor
    func installTap(on playerItem: AVPlayerItem) async -> Bool {
        removeTap(from: playerItem)

        // Load audio tracks with the async (non-deprecated) API; report failures
        // instead of folding them into the "no track" case.
        let audioTrack: AVAssetTrack
        do {
            guard let track = try await playerItem.asset.loadTracks(withMediaType: .audio).first else {
                print("[AudioTap] No audio track found on playerItem")
                return false
            }
            audioTrack = track
        } catch {
            print("[AudioTap] Failed to load audio tracks: \(error)")
            return false
        }

        // `clientInfo` is an unretained pointer to `self`; `tapInit` stores it as the
        // tap's storage so the other callbacks can recover this processor.
        var callbacks = MTAudioProcessingTapCallbacks(
            version: kMTAudioProcessingTapCallbacksVersion_0,
            clientInfo: UnsafeMutableRawPointer(Unmanaged.passUnretained(self).toOpaque()),
            init: tapInit,
            finalize: nil,
            prepare: tapPrepare,
            unprepare: nil,
            process: tapProcess
        )
        var tapOut: MTAudioProcessingTap?
        let status = MTAudioProcessingTapCreate(
            kCFAllocatorDefault, &callbacks,
            kMTAudioProcessingTapCreationFlag_PostEffects, &tapOut
        )
        guard status == noErr, let tap = tapOut else {
            print("[AudioTap] MTAudioProcessingTapCreate failed: \(status)")
            return false
        }

        let inputParams = AVMutableAudioMixInputParameters(track: audioTrack)
        inputParams.audioTapProcessor = tap
        let mix = AVMutableAudioMix()
        mix.inputParameters = [inputParams]
        playerItem.audioMix = mix
        print("[AudioTap] Tap installed successfully")

        if debugDumpEnabled {
            startDebugDump()
        }
        return true
    }

    /// Detaches the audio mix from `playerItem` and clears all per-stream state.
    /// - Parameter playerItem: Item to detach from; `nil` only clears local state.
    func removeTap(from playerItem: AVPlayerItem?) {
        playerItem?.audioMix = nil
        sourceFormat = nil
        ringBuffer.reset() // Prevent stale samples from previous stream bleeding into next FFT
        stopDebugDump()
    }

    // MARK: - FFT Processing (called from main thread timer)

    /// Runs a 1024-point FFT over the most recent samples and folds the spectrum
    /// into `bandCount` bands normalized to `0.0...1.0`.
    ///
    /// Scratch buffers are reused across calls; only the returned array is allocated.
    ///
    /// - Parameter bandCount: Number of output bands. Values `<= 0` yield `[]`.
    /// - Returns: One normalized level per band, lowest frequencies first.
    func computeFFTBands(bandCount: Int = 30) -> [Float] {
        guard bandCount > 0 else { return [] }

        let halfSize = fftSize / 2
        let sampleCount = vDSP_Length(fftSize)
        let binCount = vDSP_Length(halfSize)

        // Fetch the newest window and apply the pre-computed Hann window in place.
        // Using one pointer for input and output avoids passing the same array both
        // by value and inout, which can force a copy of the buffer on every call.
        fftTimeDomain.withUnsafeMutableBufferPointer { window in
            guard let samples = window.baseAddress else { return }
            ringBuffer.readMostRecent(into: samples, count: fftSize)
            vDSP_vmul(samples, 1, hannWindow, 1, samples, 1, sampleCount)
        }

        // The buffer-pointer closures keep every pointer stable for the whole
        // transform. `vDSP_ctoz` overwrites all `halfSize` elements of both split
        // arrays, so they do not need clearing beforehand.
        fftRealp.withUnsafeMutableBufferPointer { realBuf in
            fftImagp.withUnsafeMutableBufferPointer { imagBuf in
                fftMagnitudes.withUnsafeMutableBufferPointer { magBuf in
                    guard let realp = realBuf.baseAddress,
                          let imagp = imagBuf.baseAddress,
                          let magnitudes = magBuf.baseAddress else { return }
                    var split = DSPSplitComplex(realp: realp, imagp: imagp)

                    // Pack the real signal as interleaved pairs into split-complex form.
                    fftTimeDomain.withUnsafeBufferPointer { timeBuf in
                        guard let interleaved = timeBuf.baseAddress else { return }
                        interleaved.withMemoryRebound(to: DSPComplex.self, capacity: halfSize) {
                            vDSP_ctoz($0, 2, &split, 1, binCount)
                        }
                    }

                    vDSP_fft_zrip(fftSetup, &split, 1, fftLog2n, FFTDirection(kFFTDirection_Forward))
                    vDSP_zvmags(&split, 1, magnitudes, 1, binCount)

                    // Convert to dB in place; a zero magnitude becomes -inf, which the
                    // normalization below clamps to 0.
                    // NOTE(review): `vDSP_zvmags` yields squared magnitudes while flag 1
                    // selects the amplitude (20·log10) form; the dB window below appears
                    // tuned to that combination. Confirm before changing either.
                    var zeroReference: Float = 1.0
                    vDSP_vdbcon(magnitudes, 1, &zeroReference, magnitudes, 1, binCount, 1)
                }
            }
        }

        // Quadratic bin spacing gives the low/mid range more bands than the treble.
        // Cap at bin 300 (~13kHz if the stream is 44.1kHz: 300/512 * 22050).
        let maxBin = min(halfSize, 300)
        let minDB: Float = -50
        let maxDB: Float = 15
        let dbRange = maxDB - minDB

        var bands = [Float](repeating: 0, count: bandCount)
        for band in 0..<bandCount {
            let lowPct = Float(band) / Float(bandCount)
            let highPct = Float(band + 1) / Float(bandCount)
            let lowBin = Int(powf(lowPct, 2.0) * Float(maxBin))
            // Every band spans at least one bin and never runs past `maxBin`.
            let highBin = min(max(lowBin + 1, Int(powf(highPct, 2.0) * Float(maxBin))), maxBin)
            // An empty span is only possible for absurdly large band counts; report silence.
            guard lowBin < highBin else { continue }

            let meanDB = fftMagnitudes[lowBin..<highBin].reduce(0, +) / Float(highBin - lowBin)
            // Normalize: map the dB window to 0.0-1.0.
            bands[band] = max(0, min(1, (meanDB - minDB) / dbRange))
        }
        return bands
    }

    // MARK: - Debug Dump (WAV file)

    /// Starts capturing tap output to a mono 32-bit float WAV file in the temporary
    /// directory. The capture auto-stops after 5 seconds of audio.
    ///
    /// Any capture already running is finalized first. If the file cannot be
    /// created, the capture is not armed and `debugDumpEnabled` is cleared.
    func startDebugDump() {
        stopDebugDump()

        let url = FileManager.default.temporaryDirectory
            .appendingPathComponent("audio_tap_capture_\(Int(Date().timeIntervalSince1970)).wav")

        // Use the actual stream rate when known. When the tap has not been prepared
        // yet, fall back to 44.1kHz; the `sourceFormat` observer corrects it later.
        let sampleRate = AudioTapProcessor.integralSampleRate(of: sourceFormat)
            ?? AudioTapProcessor.fallbackSampleRate
        debugSampleRate = sampleRate
        debugMaxSamplesActual = Int(sampleRate) * AudioTapProcessor.debugCaptureSeconds

        // Provisional header for an empty file; rewritten with final values on stop.
        let header = AudioTapProcessor.makeWAVHeader(sampleRate: sampleRate, dataByteCount: 0)
        guard FileManager.default.createFile(atPath: url.path, contents: header),
              let handle = FileHandle(forWritingAtPath: url.path),
              (try? handle.seekToEnd()) != nil else {
            print("[AudioTap] Debug capture could not create \(url.lastPathComponent)")
            debugDumpURL = nil
            debugDumpEnabled = false
            return
        }

        debugDumpURL = url
        debugSamplesWritten = 0
        debugFileHandle = handle
        // Enable last so the tap callback never sees the flag without a handle.
        debugDumpEnabled = true
        print("[AudioTap] Debug capture started: \(url.lastPathComponent) at \(sampleRate)Hz")
    }

    /// Stops the running capture, rewrites the WAV header with the final sample rate
    /// and sizes, and closes the file. Does nothing when no capture is running.
    func stopDebugDump() {
        guard let handle = debugFileHandle else { return }
        debugDumpEnabled = false
        debugFileHandle = nil

        let dataSize = UInt32(clamping: debugSamplesWritten * MemoryLayout<Float>.size)
        do {
            try handle.seek(toOffset: 0)
            try handle.write(contentsOf: AudioTapProcessor.makeWAVHeader(
                sampleRate: debugSampleRate, dataByteCount: dataSize))
        } catch {
            print("[AudioTap] Debug capture header could not be finalized: \(error)")
        }
        try? handle.close()

        if debugSamplesWritten > 0 {
            let duration = Double(debugSamplesWritten) / Double(debugSampleRate)
            print("[AudioTap] Debug capture complete: \(String(format: "%.1f", duration))s, \(dataSize) bytes")
        }
        debugSamplesWritten = 0
    }

    /// Appends samples to the running capture; called from the tap's process callback.
    ///
    /// Once the sample budget is reached (or a write fails) the capture is disabled
    /// and finalization is dispatched to the main queue, followed by a
    /// `captureCompleteNotification` carrying the file URL.
    ///
    /// - Parameters:
    ///   - samples: Mono float samples; must be valid for `count` elements.
    ///   - count: Number of samples available. Only as many as fit the budget are written.
    func debugWriteSamples(_ samples: UnsafePointer<Float>, count: Int) {
        // Without an open file nothing is captured, so nothing may be counted either.
        guard debugDumpEnabled, let handle = debugFileHandle else { return }

        let writable = min(count, debugMaxSamplesActual - debugSamplesWritten)
        var writeFailed = false
        if writable > 0 {
            let data = Data(bytes: samples, count: writable * MemoryLayout<Float>.size)
            do {
                try handle.write(contentsOf: data)
                debugSamplesWritten += writable
            } catch {
                writeFailed = true
            }
        }

        guard writeFailed || debugSamplesWritten >= debugMaxSamplesActual else { return }

        // Auto-stop: clear the flag here so later callbacks return early, and do the
        // actual finalization on the main queue.
        debugDumpEnabled = false
        DispatchQueue.main.async { [weak self] in
            guard let self = self else { return }
            self.stopDebugDump()
            if let url = self.debugDumpURL {
                NotificationCenter.default.post(
                    name: AudioTapProcessor.captureCompleteNotification,
                    object: nil,
                    userInfo: ["url": url]
                )
            }
        }
    }

    // MARK: - Private helpers

    /// Returns the format's sample rate as a whole number of Hz, or `nil` when the
    /// format is missing or reports a value outside a sane range.
    private static func integralSampleRate(of format: AVAudioFormat?) -> UInt32? {
        guard let hz = format?.sampleRate, hz >= 1, hz < 1_000_000 else { return nil }
        return UInt32(hz)
    }

    /// Builds the 44-byte canonical WAV header for mono 32-bit IEEE-float PCM.
    /// All multi-byte fields are written little-endian, as the format requires.
    private static func makeWAVHeader(sampleRate: UInt32, dataByteCount: UInt32) -> Data {
        let bytesPerSample = UInt32(MemoryLayout<Float>.size)
        var header = Data(capacity: 44)

        func append<T: FixedWidthInteger>(_ value: T) {
            withUnsafeBytes(of: value.littleEndian) { header.append(contentsOf: $0) }
        }

        header.append(contentsOf: Array("RIFF".utf8))
        append(dataByteCount &+ 36)                 // RIFF chunk size: 44 - 8 + data
        header.append(contentsOf: Array("WAVE".utf8))
        header.append(contentsOf: Array("fmt ".utf8))
        append(UInt32(16))                          // fmt chunk size
        append(UInt16(3))                           // format tag: IEEE float
        append(UInt16(1))                           // channels
        append(sampleRate)
        append(sampleRate * bytesPerSample)         // byte rate: rate * 1ch * 4 bytes
        append(UInt16(bytesPerSample))              // block align
        append(UInt16(32))                          // bits per sample
        header.append(contentsOf: Array("data".utf8))
        append(dataByteCount)
        return header
    }
}
// MARK: - C Tap Callbacks (free functions, not methods)
/// Tap `init` callback: hands the creator's `clientInfo` pointer to the tap as its
/// storage, so the other callbacks can recover it via `MTAudioProcessingTapGetStorage`.
///
/// - Parameters:
///   - tap: The tap being initialized (unused).
///   - clientInfo: Pointer supplied in `MTAudioProcessingTapCallbacks.clientInfo`.
///   - tapStorageOut: Receives the value to keep as the tap's storage.
private func tapInit(
    tap: MTAudioProcessingTap,
    clientInfo: UnsafeMutableRawPointer?,
    tapStorageOut: UnsafeMutablePointer<UnsafeMutableRawPointer?>
) {
    // The stored pointer is forwarded as-is; no ownership is taken here.
    tapStorageOut.initialize(to: clientInfo)
}
/// Tap `prepare` callback: records the processing format on the owning
/// `AudioTapProcessor` and logs its key properties.
///
/// - Parameters:
///   - tap: Tap whose storage holds the unretained `AudioTapProcessor` pointer.
///   - maxFrames: Upper bound on frames per process call (unused).
///   - processingFormat: Stream description the tap will deliver audio in.
private func tapPrepare(
    tap: MTAudioProcessingTap,
    maxFrames: CMItemCount,
    processingFormat: UnsafePointer<AudioStreamBasicDescription>
) {
    // Recover the processor stashed by `tapInit`; unretained, so no balance is needed.
    let storage = MTAudioProcessingTapGetStorage(tap)
    let owner = Unmanaged<AudioTapProcessor>.fromOpaque(storage).takeUnretainedValue()
    owner.sourceFormat = AVAudioFormat(streamDescription: processingFormat)

    let description = processingFormat.pointee
    let isFloat = description.mFormatFlags & kAudioFormatFlagIsFloat != 0
    print("[AudioTap] Prepared: \(description.mSampleRate)Hz, " +
        "\(description.mChannelsPerFrame)ch, " +
        "\(description.mBitsPerChannel)bit, " +
        "float=\(isFloat)")
}
/// Tap `process` callback: pulls the source audio through unchanged and mirrors the
/// first buffer into the processor's ring buffer, Shazam handler and debug dump.
///
/// The first buffer's bytes are reinterpreted as `Float` samples.
/// NOTE(review): this presumes a 32-bit float, non-interleaved processing format
/// (first buffer = first channel); `tapPrepare` logs the actual format — confirm.
///
/// - Parameters:
///   - tap: Tap whose storage holds the unretained `AudioTapProcessor` pointer.
///   - numberFrames: Frames requested for this slice.
///   - flags: Incoming tap flags (unused).
///   - bufferListInOut: Buffer list filled by `MTAudioProcessingTapGetSourceAudio`.
///   - numberFramesOut: Receives the number of frames actually fetched.
///   - flagsOut: Receives the flags reported by the source.
private func tapProcess(
    tap: MTAudioProcessingTap,
    numberFrames: CMItemCount,
    flags: MTAudioProcessingTapFlags,
    bufferListInOut: UnsafeMutablePointer<AudioBufferList>,
    numberFramesOut: UnsafeMutablePointer<CMItemCount>,
    flagsOut: UnsafeMutablePointer<MTAudioProcessingTapFlags>
) {
    // Fetch audio from the source; the samples are not modified, so playback is unaffected.
    let status = MTAudioProcessingTapGetSourceAudio(
        tap, numberFrames, bufferListInOut, flagsOut, nil, numberFramesOut
    )
    guard status == noErr else { return }

    let processor = Unmanaged<AudioTapProcessor>
        .fromOpaque(MTAudioProcessingTapGetStorage(tap))
        .takeUnretainedValue()

    let buffers = UnsafeMutableAudioBufferListPointer(bufferListInOut)
    guard let firstBuffer = buffers.first,
          let data = firstBuffer.mData else { return }
    let floatPtr = data.assumingMemoryBound(to: Float.self)

    // Never read more floats than the buffer actually holds: if the format is not
    // what this code presumes, the reported frame count may exceed the byte size.
    let floatsAvailable = Int(firstBuffer.mDataByteSize) / MemoryLayout<Float>.size
    let frameCount = min(Int(numberFramesOut.pointee), floatsAvailable)

    if frameCount > 0 {
        // Write to ring buffer (lock-free, no allocation)
        processor.ringBuffer.write(floatPtr, count: frameCount)
    }

    // Forward to Shazam handler if active
    processor.shazamHandler?(bufferListInOut, numberFramesOut.pointee)

    // Debug dump if enabled
    if frameCount > 0 && processor.debugDumpEnabled {
        processor.debugWriteSamples(floatPtr, count: frameCount)
    }
}