-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathEncoderDecoderAudio.py
More file actions
143 lines (123 loc) · 4.59 KB
/
EncoderDecoderAudio.py
File metadata and controls
143 lines (123 loc) · 4.59 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
import wave, struct
import keras
import noisereduce as nr
import numpy as np
def dataFromWave(fname):
    """
    Read a wav file and return its samples as integers.

    Args:
        fname: Path of the wav file to read.

    Returns:
        A tuple ``(x, chans, samps, sampwidth, rate)``:
        ``x`` is a flat list of integer samples in the order they are stored
        in the file, ``chans`` is the channel count, ``samps`` the frame
        count reported by the header, ``sampwidth`` the sample width in
        bytes and ``rate`` the sampling rate.

    Raises:
        KeyError: If the sample width is not 1, 2, 3, 4 or 8 bytes.
        struct.error: If the amount of sample data does not match
            ``samps * chans`` samples.
    """
    # The context manager closes the file even if a read fails.
    with wave.open(fname, 'rb') as f:
        chans = f.getnchannels()
        samps = f.getnframes()
        sampwidth = f.getsampwidth()
        rate = f.getframerate()
        raw = f.readframes(samps)

    if sampwidth == 3:
        # struct has no 3-byte format code.  Prefix each little-endian sample
        # with a zero byte so it becomes the upper 24 bits of a 32-bit int;
        # the shift further down restores the value with sign extension.
        # readframes() returns bytes, so the padding must be bytes as well.
        raw = b''.join(b'\0' + raw[i:i + 3] for i in range(0, len(raw), 3))

    # NOTE(review): 1-byte samples are decoded as signed ('b'); confirm this
    # matches the files in use (dataToWave uses the same code).
    code = {1: 'b', 2: 'h', 3: 'i', 4: 'i', 8: 'q'}[sampwidth]
    x = list(struct.unpack('<{0}{1}'.format(samps * chans, code), raw))
    if sampwidth == 3:
        x = [k >> 8 for k in x]  # downshift to get +/- 2^23 with sign extension
    return x, chans, samps, sampwidth, rate
def dataToWave(fname, data, chans, samps, sampwidth, rate):
    """
    Write integer samples to a wav file.

    Args:
        fname: Path of the wav file to create.
        data: Indexable sequence of integer samples; only the first
            ``samps * chans`` entries are written.
        chans: Number of channels.
        samps: Number of frames to write.
        sampwidth: Sample width in bytes.
        rate: Sampling rate.

    Raises:
        KeyError: If no struct format is known for ``sampwidth``.
        IndexError: If ``data`` holds fewer than ``samps * chans`` samples.
        struct.error: If a sample does not fit the chosen format.
    """
    # The context manager closes (and finalises) the file even on error.
    with wave.open(fname, 'wb') as obj:
        # Set parameters
        obj.setnchannels(chans)
        obj.setsampwidth(sampwidth)
        obj.setframerate(rate)

        code = {1: 'b', 2: 'h', 3: 'i', 4: 'i', 8: 'q'}[sampwidth]
        count = int(samps) * int(chans)
        # Index explicitly so a too-short ``data`` raises IndexError before
        # anything is written.
        samples = [data[i] for i in range(count)]

        if sampwidth == 3:
            # struct has no 3-byte code: pack as little-endian 32-bit and keep
            # the low three bytes so every sample occupies exactly 3 bytes.
            frames = b''.join(struct.pack('<i', v)[:3] for v in samples)
        else:
            frames = struct.pack('<{0}{1}'.format(count, code), *samples)

        # One write for the whole buffer instead of one call per sample.
        obj.writeframes(frames)
def norm(x):
    """
    Clamp a sample to the signed 16-bit range [-32768, 32767].

    The network output can overshoot slightly, so out-of-range values are
    replaced by the nearest bound; in-range values are returned unchanged.
    """
    lowest, highest = -32768, 32767
    if x > highest:
        return highest
    if x < lowest:
        return lowest
    return x
def encode_audio(in_file, out_file):
    """
    Encode a wav file with the encoder half of the saved autoencoder.

    Takes in a file path to read (a wav file)
    and a file path to write the encoded file to.

    Args:
        in_file: Path of the wav file to read.
        out_file: Destination passed to ``np.savez_compressed``. The archive
            stores the encoded frames under ``data``, the wav parameters
            ``[chans, samps, width, samp_rate]`` under ``params`` and
            ``[1]`` under ``Type``.
    """
    # Build the encoder from layers 1..5 of the saved autoencoder.
    autoencoder = keras.models.load_model("audio_autoencoder.model")
    # NOTE(review): the input width is fixed at 441 while the frame width
    # below is samp_rate // 100, so this presumably only works for 44100 Hz
    # input - confirm.
    in_layer = keras.layers.Input(shape=(1, 441))
    encode = in_layer
    for idx in range(1, 6):
        encode = autoencoder.layers[idx](encode)
    encoder = keras.models.Model(in_layer, encode)

    # Read the file
    data, chans, samps, width, samp_rate = dataFromWave(in_file)
    # Turn the samples into a numpy array
    data = np.array(data)
    # Set our encoding frame width
    # Experimentally determined that 1/100th of a second has decent results
    rate = samp_rate // 100
    # Rescale integer samples over range [-32768,32767] to floats over range [0.0,1.0]
    # NOTE(review): the scale assumes 16-bit samples regardless of `width`.
    data = data.astype(float) / float(pow(2, 15))
    data += 1.0
    data = data / 2.0

    # Pad the samples with zeroes, only if needed, to make the last encoding
    # frame full.  Ceil-division avoids appending a whole extra frame when the
    # input is already a multiple of the frame width; at least one frame is
    # always produced.
    n_in = len(data)
    n_frames = max(1, -(-n_in // rate))
    padded = np.zeros((n_frames * rate,))
    padded[0:n_in] = data

    # Construct input layer
    inputs = padded.reshape(n_frames, 1, rate)
    # Encode the data
    encoded = encoder.predict(inputs)
    # Save the encoded data, as well as the important parameters
    np.savez_compressed(out_file, data=encoded, params=np.array([chans, samps, width, samp_rate]), Type=[1])
def decode_audio(in_file, out_file):
    """
    Decode an encoded archive back into a wav file.

    The decoder is built from the last four layers of the model stored at
    "audio_autoencoder.model".

    Args:
        in_file: Path prefix of the encoded archive; ``in_file + ".npz"`` is
            loaded and must contain ``data`` and ``params`` entries.
        out_file: Path prefix of the output; the wav file is written to
            ``out_file + ".wav"``.
    """
    # Load the model
    autoencoder = keras.models.load_model("audio_autoencoder.model")
    in_layer = keras.layers.Input(shape=(1, 441//16))
    decode = in_layer
    for idx in range(-4, 0):
        decode = autoencoder.layers[idx](decode)
    decoder = keras.models.Model(in_layer, decode)

    # Load the data; the context manager closes the archive once the arrays
    # have been read out of it.
    with np.load(in_file + ".npz") as ins:
        encoded = ins['data']
        params = ins['params']
    # Plain Python ints for the wav writer instead of numpy scalars.
    chans = int(params[0])
    samps = int(params[1])
    width = int(params[2])
    samp_rate = int(params[3])

    # Run the decoder
    outputs = decoder.predict(encoded)
    # Build a wav file: flatten the decoded frames into one sample stream
    out = outputs.reshape(outputs.shape[0]*outputs.shape[-1])
    # Samples above 0.85 are used as the noise profile for noise reduction.
    loud = out > 0.85
    if np.any(loud):
        noisy_part = out[loud]
        out = nr.reduce_noise(audio_clip=out, noise_clip=noisy_part)
    # Map floats over [0.0,1.0] back to integers and bound them to 16 bits
    out = (((out * 2.0) - 1.0) * float(pow(2, 15))).astype(int)
    out = list(map(norm, out))
    dataToWave(out_file + ".wav", out, chans, samps, width, samp_rate)