NeuralNetwork-Based-Generic-Compression/EncoderDecoderAudio.py at master · SaraHisham/NeuralNetwork-Based-Generic-Compression · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
import wave, struct
import keras
import noisereduce as nr
import numpy as np


def dataFromWave(fname):
    """
    Reads a wav file to samples.

    Parameters:
        fname: path (or file-like object) accepted by ``wave.open``.

    Returns:
        A tuple ``(x, chans, samps, sampwidth, rate)`` where ``x`` is a flat
        list of integer samples (channels interleaved, as stored in the
        file), ``chans`` is the channel count, ``samps`` the number of
        frames, ``sampwidth`` the sample width in bytes and ``rate`` the
        sampling rate.

    Raises:
        KeyError: if the sample width is not one of 1, 2, 3, 4 or 8 bytes.
        struct.error: if the file holds fewer frames than its header claims.
    """
    # The context manager closes the file even if a read fails
    with wave.open(fname, 'rb') as f:
        # Read Channel Number
        chans = f.getnchannels()
        # Get raw sample count
        samps = f.getnframes()
        # Get bit-width of samples
        sampwidth = f.getsampwidth()
        # Get sampling rate
        rate = f.getframerate()
        # Read samples (readframes returns bytes)
        s = f.readframes(samps)

    if sampwidth == 3:
        # struct has no 24-bit code, so widen every sample to 32 bits.
        # The data is little-endian, so the zero byte placed in FRONT becomes
        # the least significant byte; the sample's sign bit stays on top.
        s = b''.join(b'\0' + s[c:(c + 3)] for c in range(0, len(s), 3))

    # Unpack samples
    # NOTE(review): width 1 is unpacked as signed ('b'); kept as in the
    # original so it stays symmetric with dataToWave.
    unpstr = '<{0}{1}'.format(samps * chans, {1: 'b', 2: 'h', 3: 'i', 4: 'i', 8: 'q'}[sampwidth])
    x = list(struct.unpack(unpstr, s))
    if sampwidth == 3:
        x = [k >> 8 for k in x]  # downshift to get +/- 2^23 with sign extension

    return x, chans, samps, sampwidth, rate


def dataToWave(fname, data, chans, samps, sampwidth, rate):
    """
    Writes samples to a wav file.

    Parameters:
        fname: destination path (or file-like object) for ``wave.open``.
        data: indexable sequence of integer samples, channels interleaved;
            only the first ``samps * chans`` entries are written.
        chans: number of channels.
        samps: number of frames to write.
        sampwidth: sample width in bytes.
        rate: sampling rate.

    Raises:
        IndexError: if ``data`` holds fewer than ``samps * chans`` samples.
        wave.Error: if the wave module rejects a parameter.
        struct.error: if a sample does not fit the packing format.
    """
    total = int(samps) * int(chans)
    # Fail before creating the file rather than leaving a truncated one behind
    if len(data) < total:
        raise IndexError("data has {0} samples, {1} required".format(len(data), total))

    # The context manager closes (and finalises the header of) the file
    # even if packing fails
    with wave.open(fname, 'wb') as obj:
        # Set parameters
        obj.setnchannels(chans)
        obj.setsampwidth(sampwidth)
        obj.setframerate(rate)
        # set up the packaging format
        code = {1: 'b', 2: 'h', 3: 'i', 4: 'i', 8: 'q'}[sampwidth]
        # Package the samples
        if sampwidth == 3:
            # struct has no 24-bit code: pack as little-endian 32-bit and keep
            # the three low-order bytes so each sample occupies 3 bytes.
            frames = b''.join(struct.pack('<i', v)[:3] for v in data[:total])
        else:
            frames = struct.pack('<{0}{1}'.format(total, code), *data[:total])
        # One bulk write instead of one call per sample
        obj.writeframesraw(frames)


def norm(x):
    """
    Clamp a sample to the signed 16-bit wav range.

    The network output can overshoot slightly; values below -32768 or above
    32767 are replaced by the nearest bound, anything else is returned as-is.
    """
    lowest, highest = -32768, 32767
    if x < lowest:
        return lowest
    return highest if x > highest else x


def encode_audio(in_file, out_file):
    """
    Takes in a file path to read (a wav file)
    and a file path to write the encoded file to.

    The encoder is assembled from layers 1..5 of the autoencoder stored in
    "audio_autoencoder.model" (loaded relative to the working directory).
    The samples are rescaled, cut into frames of 1/100th of a second and
    run through the encoder; the result is saved with
    ``np.savez_compressed`` together with the wav parameters
    ``[chans, samps, width, samp_rate]``.

    Parameters:
        in_file: path of the wav file to encode.
        out_file: path of the archive to write (numpy appends ".npz" when
            the name does not already end with it).
    """
    autoencoder = keras.models.load_model("audio_autoencoder.model")
    # NOTE(review): the input width is fixed at 441, which equals
    # samp_rate // 100 only for 44100 Hz audio - presumably the only rate
    # the stored model supports; confirm.
    in_layer = keras.layers.Input(shape=(1, 441))
    encode = in_layer
    for layer in autoencoder.layers[1:6]:
        encode = layer(encode)
    encoder = keras.models.Model(in_layer, encode)

    # Read the file
    data, chans, samps, width, samp_rate = dataFromWave(in_file)

    # Turn the samples into a numpy array
    data = np.array(data)

    # Set our encoding frame width
    # Experimentally determined that 1/100th of a second has decent results
    rate = samp_rate // 100
    # Rescale integer samples over range [-32768,32767] to floats over range [0.0,1.0]
    data = data.astype(float) / float(pow(2, 15))
    data += 1.0
    data = data / 2.0

    # Pad the samples with zeroes, if needed, to make the last encoding frame full.
    # Ceiling division: an input that already fills its last frame gets no
    # extra all-zero frame; an empty input still yields one frame so the
    # encoder always receives a non-empty batch.
    n_in = len(data)
    n_frames = max(1, -(-n_in // rate))
    padded = np.zeros((n_frames * rate,))
    padded[0:n_in] = data

    # Construct input layer
    inputs = padded.reshape(n_frames, 1, rate)

    # Encode the data
    encoded = encoder.predict(inputs)
    # Save the encoded data, as well as the important parameters
    np.savez_compressed(out_file, data=encoded, params=np.array([chans, samps, width, samp_rate]), Type=[1])


def decode_audio(in_file, out_file):
    """
    Decodes an encoded archive back into a wav file.

    The decoder is assembled from the last four layers of the autoencoder
    stored in "audio_autoencoder.model" (loaded relative to the working
    directory).

    Parameters:
        in_file: path of the encoded archive WITHOUT its ".npz" suffix; the
            archive must provide the arrays ``data`` and ``params``
            (``[chans, samps, width, samp_rate]``).
        out_file: path of the wav file to write WITHOUT its ".wav" suffix.
    """
    # Load the model
    autoencoder = keras.models.load_model("audio_autoencoder.model")
    in_layer = keras.layers.Input(shape=(1, 441//16))
    decode = in_layer
    for layer in autoencoder.layers[-4:]:
        decode = layer(decode)
    decoder = keras.models.Model(in_layer, decode)

    # Load the data; the context manager closes the archive once the arrays
    # have been read
    with np.load(in_file + ".npz") as ins:
        encoded = ins['data']
        # Plain Python ints, so the wave module never sees numpy scalars
        chans, samps, width, samp_rate = (int(v) for v in ins['params'][:4])

    # Run the decoder
    outputs = decoder.predict(encoded)

    # Build a wav file: flatten the frames into one stream of samples
    out = outputs.reshape(outputs.shape[0]*outputs.shape[-1])

    # Samples above 0.85 are handed to noisereduce as the noise profile
    if np.any(out > 0.85):
        noisy_part = out[out > 0.85]
        out = nr.reduce_noise(audio_clip=out, noise_clip=noisy_part)

    # Map floats over [0.0,1.0] back to the integer wav scale
    out = ((out * 2.0) - 1.0) * float(pow(2, 15))
    # NN output isn't quite perfect: bound it to the 16-bit range before the
    # integer cast (same values as clamping afterwards, but vectorised)
    out = np.clip(out, -32768, 32767).astype(int).tolist()

    dataToWave(out_file + ".wav", out, chans, samps, width, samp_rate)