Python librosa module: load() code examples

The following code examples, extracted from open-source Python projects, illustrate how librosa.load() is used in practice.
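
As a quick orientation before the project excerpts, here is a minimal sketch of the call itself. The file path is hypothetical; the defaults shown (mono, float32, resampled to 22050 Hz) are librosa's documented behavior:

import librosa

# Default: decode to mono float32 and resample to 22050 Hz.
y, sr = librosa.load("example.wav")

# sr=None keeps the file's native sample rate instead of resampling.
y_native, sr_native = librosa.load("example.wav", sr=None)

# An explicit rate resamples on load; mono=True (the default) downmixes stereo.
y_16k, _ = librosa.load("example.wav", sr=16000, mono=True)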

Project: gtzan.keras | Author: Hguimaraes
def getdata(self):
    # Structure for the array of songs
    song_data = []
    genre_data = []

    # Read files from the folders
    for x,_ in self.genres.items():
      for root, subdirs, files in os.walk(self.file_path + x):
        for file in files:
            # Read the audio file
            file_name = self.file_path + x + "/" + file
            print(file_name)
            signal, sr = librosa.load(file_name)

            # Calculate the melspectrogram of the audio and use log scale
            melspec = librosa.feature.melspectrogram(signal[:self.song_samples],
              sr = sr, n_fft = self.n_fft, hop_length = self.hop_length).T[:1280,]

            # Append the result to the data structure
            song_data.append(melspec)
            genre_data.append(self.genres[x])
    return np.array(song_data), keras.utils.to_categorical(genre_data, len(self.genres))
Project: speechless | Author: JuliusKunze
def __init__(self,
                 audio_file: Path,
                 id: Optional[str] = None,
                 sample_rate_to_convert_to: int = 16000,
                 label: Optional[str] = "nolabel",
                 fourier_window_length: int = 512,
                 hop_length: int = 128,
                 mel_frequency_count: int = 128,
                 label_with_tags: str = None,
                 positional_label: Optional[PositionalLabel] = None):
        # The default values for hop_length and fourier_window_length are powers of 2 near the values specified in the wave2letter paper.

        if id is None:
            id = name_without_extension(audio_file)

        self.audio_file = audio_file

        super().__init__(
            id=id, get_raw_audio=lambda: librosa.load(str(self.audio_file), sr=self.sample_rate)[0],
            label=label, sample_rate=sample_rate_to_convert_to,
            fourier_window_length=fourier_window_length, hop_length=hop_length, mel_frequency_count=mel_frequency_count,
            label_with_tags=label_with_tags, positional_label=positional_label)
Project: skill-voice-recognition | Author: TREE-Edu
def __init__(self, images, labels, fake_data=False, one_hot=False, load=False):
        """Construct a DataSet. one_hot arg is used only if fake_data is true."""
        if fake_data:
            self._num_examples = 10000
            self.one_hot = one_hot
        else:
            num = len(images)
            assert num == len(labels), ('len(images): %d len(labels): %d' % (num, len(labels)))
            print("len(images) %d" % num)
            self._num_examples = num
        self.cache={}
        self._image_names = numpy.array(images)
        self._labels = labels
        self._epochs_completed = 0
        self._index_in_epoch = 0
        self._images=[]
        if load: # Otherwise loaded on demand
            self._images=self.load(self._image_names)
Project: skill-voice-recognition | Author: TREE-Edu
def read_data_sets(train_dir,source_data=Source.NUMBER_IMAGES, fake_data=False, one_hot=True):
    class DataSets(object):
        pass
    data_sets = DataSets()
    if fake_data:
        data_sets.train = DataSet([], [], fake_data=True, one_hot=one_hot)
        data_sets.validation = DataSet([], [], fake_data=True, one_hot=one_hot)
        data_sets.test = DataSet([], [], fake_data=True, one_hot=one_hot)
        return data_sets
    VALIDATION_SIZE = 2000
    local_file = maybe_download(source_data, train_dir)
    train_images = extract_images(TRAIN_INDEX,train=True)
    train_labels = extract_labels(TRAIN_INDEX,train=True, one_hot=one_hot)
    test_images = extract_images(TEST_INDEX,train=False)
    test_labels = extract_labels(TEST_INDEX,train=False, one_hot=one_hot)
    # train_images = train_images[:VALIDATION_SIZE]
    # train_labels = train_labels[:VALIDATION_SIZE:]
    # test_images = test_images[VALIDATION_SIZE:]
    # test_labels = test_labels[VALIDATION_SIZE:]
    data_sets.train = DataSet(train_images, train_labels , load=False)
    data_sets.test = DataSet(test_images, test_labels, load=True)
    # data_sets.validation = DataSet(validation_images, validation_labels, load=True)
    return data_sets
Project: magenta | Author: tensorflow
def load_audio(audio_filename, sample_rate):
  """Loads an audio file.

  Args:
    audio_filename: File path to load.
    sample_rate: The number of samples per second at which the audio will be
        returned. Resampling will be performed if necessary.

  Returns:
    A numpy array of audio samples, single-channel (mono) and sampled at the
    specified rate, in float32 format.

  Raises:
    AudioIOReadException: If librosa is unable to load the audio data.
  """
  try:
    y, unused_sr = librosa.load(audio_filename, sr=sample_rate, mono=True)
  except Exception as e:  # pylint: disable=broad-except
    raise AudioIOReadException(e)
  return y
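
A hedged usage sketch of the helper above (the path is hypothetical):

samples = load_audio("speech.wav", sample_rate=16000)  # mono float32 samples at 16 kHz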
Project: motif | Author: rabitt
def get_seeds(self, audio_filepath):
        """Get the seeds file to pass to the HLL tracker.

        Parameters
        ----------
        audio_filepath : str
            Path to audio file.

        Returns
        -------
        seeds_fpath : str
            Path to the seeds output file.

        """
        y, sr = librosa.load(audio_filepath, sr=44100)
        y_harmonic = librosa.effects.harmonic(y)
        cqt, samples, freqs = self._compute_cqt(y_harmonic, sr)
        seeds = self._pick_seeds_cqt(cqt, freqs, samples)

        seeds_fpath = tmp.mktemp('.csv')
        with open(seeds_fpath, 'w') as fhandle:
            writer = csv.writer(fhandle, delimiter=',')
            writer.writerows(seeds)
        return seeds_fpath
Project: EUSIPCO2017 | Author: Veleslavia
def compute_spectrograms(filename):
    out_rate = 12000
    N_FFT = 512
    HOP_LEN = 256

    frames, rate = librosa.load(filename, sr=out_rate, mono=True)
    if len(frames) < out_rate*3:
        # shorter than 3 seconds - can't process
        raise Exception("Audio duration is too short")

    logam = librosa.logamplitude
    melgram = librosa.feature.melspectrogram
    x = logam(melgram(y=frames, sr=out_rate, hop_length=HOP_LEN,
                      n_fft=N_FFT, n_mels=N_MEL_BANDS) ** 2,
              ref_power=1.0)

    # now going through spectrogram with the stride of the segment duration
    for start_idx in range(0, x.shape[1] - SEGMENT_DUR + 1, SEGMENT_DUR):
        yield x[:, start_idx:start_idx + SEGMENT_DUR]
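
A hedged usage sketch for the generator above, assuming the project's module-level constants N_MEL_BANDS and SEGMENT_DUR and a hypothetical input file:

# Each yielded item is an (N_MEL_BANDS, SEGMENT_DUR) slice of the log-mel spectrogram.
segments = list(compute_spectrograms("track.mp3"))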
Project: laughter | Author: ganesh-srinivas
def predict_on_long_clips():
    """Load the saved model and perform inference/prediction on features obtained from inputs. 
    Splits the audio into 10second chunks and predicts on those chunks."""
    with open(FILENAMES,"r") as fh:
        filecontents=fh.read()
        filenames=filecontents.splitlines()
        random.shuffle(filenames)
        filenames=filenames[:5] #[:5] is for quickly verifying if things work
        filenames = [DATASET_LOCATION+f for f in filenames]

    session = tf.Session()
    saver = tf.train.import_meta_graph(IMPORT_META_GRAPH)
    saver.restore(session, tf.train.latest_checkpoint(IMPORT_LATEST_CHECKPOINT))
    tf.global_variables_initializer().run(session=session)

    test_x = {}
    for f in filenames:
        s, sr = librosa.load(f)
        total_chunks = s.shape[0] // max_audio_length  # integer number of full chunks
        waveforms = [s[max_audio_length*i:max_audio_length*(i+1)] for i in range(total_chunks)]
        test_x[f] = extract_features_from_waveforms(waveforms)

        print "FILENAME: ", f
        predictions = session.run(tf.argmax(pred, 1), feed_dict={X: test_x[f]})
        print [possible_categories[p] for p in predictions]
Project: mmfeat | Author: douwekiela
def loadFile(self, fname):
        '''
        fname:      filename of the sound file we want to load
        '''
        if self.verbose: print('Loading %s' % fname)

        if self.cached:
            if not os.path.exists(fname + '-mfcc.npy'):
                y, sr = librosa.load(fname)
                data = mfcc(y=y, sr=sr).T
                np.save(fname + '-mfcc.npy', data)
            else:
                data = np.load(fname + '-mfcc.npy')
        else:
            y, sr = librosa.load(fname)
            # TODO: Add ability to filter by seconds/duration
            # seconds = y.size/sr
            data = mfcc(y=y, sr=sr).T

        return data
Project: audio-tagging-toolkit | Author: hipstas
def get_mfccs_and_deltas(wav_pathname, n_mfcc=13, n_fft=2048, freq_min=100, freq_max=16000):
    sample_array, sample_rate = librosa.load(wav_pathname, sr=44100)
    if len(sample_array) == 0:
        return []
    else:
        mfcc = librosa.feature.mfcc(sample_array, sample_rate, n_fft=n_fft, hop_length=n_fft, n_mfcc=n_mfcc, fmin=freq_min, fmax=freq_max)
        delta = librosa.feature.delta(mfcc)
        delta2 = librosa.feature.delta(mfcc, order=2)
        mfcc = mfcc.T  ### Transposing tables
        delta = delta.T  ## (We can instead set the axis above to do this without the extra step)
        delta2 = delta2.T
        mfcc_sans_0th = [frame_values[1:] for frame_values in mfcc]
        all_features = []
        for i in range(len(mfcc)):
            all_features.append(list(mfcc_sans_0th[i]) + list(delta[i]) + list(delta2[i]))
        return all_features
Project: toho_mir_ml | Author: kodack64
def recognise_mfcc(filePath,outputDir,outputName,debug):

    print("start decompose harmonic/percussive and extract mfcc {0}".format(filePath))
    y,sr = librosa.load(filePath)
    mfcc = librosa.feature.mfcc(y=y,sr=sr)
    mfcc = np.transpose(mfcc)
    basePath = outputDir + outputName
    np.savetxt(basePath+"_normal_mfcc.csv",mfcc,delimiter=",")
    harmonic_sep = 3.0
    percussive_sep = 3.0
    h,p = librosa.effects.hpss(y,margin=(harmonic_sep,percussive_sep))
    hmfcc = librosa.feature.mfcc(y=h,sr=sr)
    hmfcc = np.transpose(hmfcc)
    np.savetxt(basePath+"_harmonic_mfcc.csv",hmfcc,delimiter=",")
    pmfcc = librosa.feature.mfcc(y=p,sr=sr)
    pmfcc = np.transpose(pmfcc)
    np.savetxt(basePath+"_percussive_mfcc.csv",pmfcc,delimiter=",")

# extract rhythm pattern with rp_extract
Project: tensorflow-wavenet | Author: ibab
def load_generic_audio(directory, sample_rate):
    '''Generator that yields audio waveforms from the directory.'''
    files = find_files(directory)
    id_reg_exp = re.compile(FILE_PATTERN)
    print("files length: {}".format(len(files)))
    randomized_files = randomize_files(files)
    for filename in randomized_files:
        ids = id_reg_exp.findall(filename)
        if not ids:
            # The file name does not match the pattern containing ids, so
            # there is no id.
            category_id = None
        else:
            # The file name matches the pattern for containing ids.
            category_id = int(ids[0][0])
        audio, _ = librosa.load(filename, sr=sample_rate, mono=True)
        audio = audio.reshape(-1, 1)
        yield audio, filename, category_id
Project: DropMuse | Author: DropMuse
def get_audio_analysis(song_url):
    if song_url is None:
        return None, None, None, None, None
    urlretrieve(song_url, "current.mp3")
    y, sr = librosa.load("./current.mp3")

    # Tempo = beats/minute
    tempo, beats = librosa.beat.beat_track(y=y, sr=sr)

    # pitch = Frequency
    pitches, magnitudes = librosa.piptrack(y=y, sr=sr,
                                           fmax=1000, hop_length=1000)

    pitches, magnitudes = extract_max(pitches, magnitudes, pitches.shape)
    y[abs(y) < 10**-2] = 0
    y = np.trim_zeros(y)

    json = {
        'sound_wave': np.array(y[:len(pitches)]).tolist(),
        'pitch': pitches
    }
    y_harm, y_per = librosa.effects.hpss(y)
    harm, perc = audio_fingerprint(y_harm), audio_fingerprint(y_per)
    pitch_ave = np.average(pitches)
    return float(tempo), float(pitch_ave), float(harm), float(perc), json
Project: SpeechSeparation | Author: Unisound
def main():
    outdir = 'mix'
    if not os.path.exists(outdir):
        os.makedirs(outdir)
    audio_total1, sr = librosa.load('./cao.wav', sr=sample_rate, mono=True)
    audio_total2, sr = librosa.load('./huang.wav', sr=sample_rate, mono=True)

    seglen = int(sav_n_secs * sr)

    len1 = audio_total1.shape[0] - seglen
    len2 = audio_total2.shape[0] - seglen

    for i in range(train_data_num):
        if i % 100 == 0:
            print(i)
        idx1 = random.randint(0, len1)
        idx2 = random.randint(0, len2)
        mix(audio_total1[idx1:idx1+seglen], audio_total2[idx2:idx2+seglen], sample_rate, sav_n_secs, outdir, i)
Project: gcforest | Author: w821881341
def save_cache(src_path, des_path, get_feature_func):
    des_path = osp.splitext(des_path)[0] + '.npy'
    try:
        X, sr = librosa.load(src_path)
        sr = int(sr)
        feature = get_feature_func(X, sr)
        print('[INFO] Saving Cache in {} ...'.format(des_path))
        des_par = osp.abspath(osp.join(des_path, osp.pardir))
        if not osp.exists(des_par):
            os.makedirs(des_par)
    except Exception as e:
        print("[ERROR] Unknown error happened when dealing with {}".format(src_path))
        #print(e)
        return -1
    np.save(des_path, feature)
    return 0
Project: aed-by-cnn | Author: tweihaha
def adjust_volume(in_fp):
    def adjust(volume):
        audio_p = audio + volume
        fn_p = fn + "_" + str(volume) +"db" + ".wav"
        fd = audio_p.export(path.join(out_dir, str(volume) + 'db', path.split(in_dir)[-1], fn_p), format=format)

    in_dir, fn = path.split(in_fp)
    fn, file_ext = path.splitext(fn)
    file_ext = file_ext.lower()
    format = file_ext.replace('.', '')
    # audio = None
    y, sr = librosa.load(in_fp, sr=44100)
    tmp_in_fp = "tmp/" + fn + "_tmp.wav"
    librosa.output.write_wav(tmp_in_fp, y, sr, norm=False)
    format = "wav"
    audio = aseg.from_file(tmp_in_fp, format)
    os.remove(tmp_in_fp)

    if audio is not None:
        for v in volume_list:
            adjust(v)
Project: crnn-music-genre-classification | Author: meetshah1995
def log_scale_melspectrogram(path, plot=False):
    signal, sr = lb.load(path, sr=Fs)
    n_sample = signal.shape[0]
    n_sample_fit = int(DURA*Fs)

    if n_sample < n_sample_fit:
        signal = np.hstack((signal, np.zeros((int(DURA*Fs) - n_sample,))))
    elif n_sample > n_sample_fit:
        signal = signal[(n_sample-n_sample_fit)//2:(n_sample+n_sample_fit)//2]  # integer indices

    melspect = lb.logamplitude(lb.feature.melspectrogram(y=signal, sr=Fs, hop_length=N_OVERLAP, n_fft=N_FFT, n_mels=N_MELS)**2, ref_power=1.0)

    if plot:
        melspect = melspect[np.newaxis, :]
        misc.imshow(melspect.reshape((melspect.shape[1],melspect.shape[2])))
        print(melspect.shape)

    return melspect
Project: EnglishSpeechUpsampler | Author: jhetherly
def read_file_pair(filename_pair, mono=True):
    """
    given a pair of file names, read in both waveforms and upsample (through
    librosa's default interpolation) the downsampled waveform
    assumes the file name pair is of the form ("original", "downsampled")
    mono selects whether to read in mono or stereo formatted waveforms

    returns a pair of numpy arrays representing the original and upsampled
    waveform
    """
    channel = 1 if mono else 2
    true_waveform, true_br = librosa.load(filename_pair[0], sr=None,
                                          mono=mono)
    ds_waveform, _ = librosa.load(filename_pair[1], sr=true_br, mono=mono)
    # truth, example
    return true_waveform.reshape((-1, channel)), \
        ds_waveform.reshape((-1, channel))
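
A hedged usage sketch (hypothetical file names). Passing the original file's native rate as sr for the second load is what makes librosa upsample the downsampled waveform during decoding:

truth, example = read_file_pair(("original_48k.wav", "downsampled_8k.wav"))
# truth and example are each shaped (n_samples, 1) at the original file's rate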
Project: vae-npvc | Author: JeremyCCHsu
def extract(filename, fft_size=FFT_SIZE, dtype=np.float32):
    ''' Basic (WORLD) feature extraction ''' 
    x, _ = librosa.load(filename, sr=args.fs, mono=True, dtype=np.float64)
    features = wav2pw(x, args.fs, fft_size=fft_size)
    ap = features['ap']
    f0 = features['f0'].reshape([-1, 1])
    sp = features['sp']
    en = np.sum(sp + EPSILON, axis=1, keepdims=True)
    sp = np.log10(sp / en)
    return np.concatenate([sp, ap, f0, en], axis=1).astype(dtype)
Project: Sound-classification-on-Raspberry-Pi-with-Tensorflow | Author: GianlucaPaolocci
def extract_features(file_name):
    X, sample_rate = librosa.load(file_name)
    stft = np.abs(librosa.stft(X))
    mfccs = np.array(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=8).T)
    chroma = np.array(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T)
    mel = np.array(librosa.feature.melspectrogram(X, sr=sample_rate).T)
    contrast = np.array(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T)
    tonnetz = np.array(librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr=sample_rate).T)
    return mfccs,chroma,mel,contrast,tonnetz
Project: speechless | Author: JuliusKunze
def _load_from_cache(self):
        try:
            return numpy.load(str(self.spectrogram_cache_file))
        except ValueError:
            log("Recalculating cached file {} because loading failed.".format(self.spectrogram_cache_file))
            return self._calculate_and_save_spectrogram()
Project: crema | Author: bmcfee
def SIGNAL():
    y, sr = librosa.load(librosa.util.example_audio_file(),
                         sr=None)
    return y, sr
Project: mugen | Author: scherroman
def create_marked_audio_file(mark_locations: Union[List[float], np.ndarray], output_path: Opt[str] = None, *,
                             audio_file: Opt[str] = None, duration: float = None):
    if audio_file:
        y, sr = librosa.load(audio_file)
        marked_audio = librosa.core.clicks(times=mark_locations, sr=sr, length=len(y))
        marked_audio = y + marked_audio
    elif duration:
        sr = 22050
        marked_audio = librosa.core.clicks(times=mark_locations, sr=sr, length=int(sr * duration))
    else:
        raise ParameterError("Must provide either audio file or duration.")

    librosa.output.write_wav(path=output_path, y=marked_audio, sr=sr)

    return output_path
Project: mugen | Author: scherroman
def __init__(self, file: str, *, sample_rate: int = 44100):
        """        
        Parameters
        ----------
        file
            Audio file to load
        """

        self.file = file
        self.samples, self.sample_rate = librosa.load(file, sr=sample_rate)
        self.duration = librosa.get_duration(y=self.samples, sr=self.sample_rate)
Project: aurora | Author: caretcaret
def clip_audio(specs, raw_audio, output):
  # Load the spec data. In clipping audio, we hold the specs fixed.
  spec_filenames = next(os.walk(specs))[2]
  if len(spec_filenames) == 0:
    print("No specs found.")
    return
  for spec_filename in spec_filenames:
    with open(os.path.join(specs, spec_filename)) as f:
      spec = json.load(f)
    youtube_id = spec['audio_source']['youtube_id']
    start_time = spec['audio_source']['start_time']
    end_time = spec['audio_source']['end_time']

    raw_audio_filenames = glob.glob(os.path.join(raw_audio, youtube_id + '.*'))
    if len(raw_audio_filenames) == 0:
      # No audio file found, skip.
      continue
    raw_audio_filename = raw_audio_filenames[0]
    raw_audio_extension = os.path.splitext(raw_audio_filename)[1]
    clip_filename = os.path.join(
        output, CLIP_NAME_PATTERN.format(youtube_id, start_time, end_time) +
        raw_audio_extension)

    # Call ffmpeg to output the trimmed clip.
    os.makedirs(os.path.dirname(clip_filename), exist_ok=True)
    call1 = ['ffmpeg', '-loglevel', 'error', '-n',
             '-ss', str(start_time), '-t', str(end_time - start_time),
             '-i', raw_audio_filename]
    if raw_audio_extension == '.ogg':  # splitext keeps the leading dot
      call2 = ['-codec:a', 'libvorbis', '-strict', 'experimental']
    else:
      call2 = []
    call3 = [clip_filename]
    process = subprocess.run(call1 + call2 + call3)
    if process.returncode != 0:
      print("Error: {} encountered by {}".format(
          process.returncode, clip_filename))
    else:
      print(clip_filename)
Project: nnmnkwii | Author: r9y9
def test_dtw_aligner():
    x, fs = librosa.load(example_audio_file(), sr=None)
    assert fs == 16000
    x_fast = librosa.effects.time_stretch(x, 2.0)

    X = _get_mcep(x, fs)
    Y = _get_mcep(x_fast, fs)

    D = X.shape[-1]

    # Create padded pair
    X, Y = adjast_frame_lengths(X, Y, divisible_by=2)

    # Add utterance axis
    X = X.reshape(1, -1, D)
    Y = Y.reshape(1, -1, D)

    X_aligned, Y_aligned = DTWAligner().transform((X, Y))
    assert X_aligned.shape == Y_aligned.shape
    assert np.linalg.norm(X_aligned - Y_aligned) < np.linalg.norm(X - Y)

    X_aligned, Y_aligned = IterativeDTWAligner(
        n_iter=2, max_iter_gmm=10, n_components_gmm=2).transform((X, Y))
    assert X_aligned.shape == Y_aligned.shape
    assert np.linalg.norm(X_aligned - Y_aligned) < np.linalg.norm(X - Y)

    # Custom dist function
    from nnmnkwii.metrics import melcd
    X_aligned, Y_aligned = DTWAligner(dist=melcd).transform((X, Y))
    assert np.linalg.norm(X_aligned - Y_aligned) < np.linalg.norm(X - Y)
Project: skill-voice-recognition | Author: TREE-Edu
def mfcc_batch_generator(batch_size=10, source=Source.DIGIT_WAVES, target=Target.digits):
    maybe_download(source, DATA_DIR)
    if target == Target.speaker: speakers = get_speakers()
    batch_features = []
    labels = []
    files = os.listdir(path)
    while True:
        print("loaded batch of %d files" % len(files))
        shuffle(files)
        for file in files:
            if not file.endswith(".wav"): continue
            wave, sr = librosa.load(path+file, mono=True)
            mfcc = librosa.feature.mfcc(wave, sr)
            if target==Target.speaker: label=one_hot_from_item(speaker(file), speakers)
            elif target==Target.digits:  label=dense_to_one_hot(int(file[0]),10)
            elif target==Target.first_letter:  label=dense_to_one_hot((ord(file[0]) - 48) % 32,32)
            elif target == Target.hotword: label = one_hot_word(file, pad_to=max_word_length)  #
            elif target == Target.word: label=string_to_int_word(file, pad_to=max_word_length)
                # label = file  # sparse_labels(file, pad_to=20)  # max_output_length
            else: raise Exception("todo : labels for Target!")
            labels.append(label)
            # print(np.array(mfcc).shape)
            mfcc=np.pad(mfcc,((0,0),(0,80-len(mfcc[0]))), mode='constant', constant_values=0)
            batch_features.append(np.array(mfcc))
            if len(batch_features) >= batch_size:
                # if target == Target.word:  labels = sparse_labels(labels)
                # labels=np.array(labels)
                # print(np.array(batch_features).shape)
                # yield np.array(batch_features), labels
                # print(np.array(labels).shape) # why (64,) instead of (64, 15, 32)? OK IFF dim_1==const (20)
                yield batch_features, labels  # basic_rnn_seq2seq inputs must be a sequence
                batch_features = []  # Reset for next batch
                labels = []


# If you set dynamic_pad=True when calling tf.train.batch the returned batch will be automatically padded with 0s. Handy! A lower-level option is to use tf.PaddingFIFOQueue.
# only apply to a subset of all images at one time
Project: skill-voice-recognition | Author: TREE-Edu
def load(self,image_names):
        print("loading %d images"%len(image_names))
        return list(map(self.load_image, image_names))  # list() because Python 3's map returns an iterator
Project: skill-voice-recognition | Author: TREE-Edu
def next_batch(self, batch_size, fake_data=False):
        """Return the next `batch_size` examples from this data set."""
        if fake_data:
            fake_image = [1] * width * height
            if self.one_hot:
                fake_label = [1] + [0] * 9
            else:
                fake_label = 0
            return [fake_image for _ in range(batch_size)], [
                    fake_label for _ in range(batch_size)]
        start = self._index_in_epoch
        self._index_in_epoch += batch_size
        if self._index_in_epoch > self._num_examples:
            # Finished epoch
            self._epochs_completed += 1
            # Shuffle the data
            perm = numpy.arange(self._num_examples)
            numpy.random.shuffle(perm)
            # self._images = self._images[perm]
            self._image_names = self._image_names[perm]
            self._labels = self._labels[perm]
            # Start next epoch
            start = 0
            self._index_in_epoch = batch_size
            assert batch_size <= self._num_examples
        end = self._index_in_epoch
        return self.load(self._image_names[start:end]), self._labels[start:end]


# multi-label
Project: magenta | Author: tensorflow
def load_audio(path, sample_length=64000, sr=16000):
  """Loading of a wave file.

  Args:
    path: Location of a wave file to load.
    sample_length: The truncated total length of the final wave file.
    sr: Samples per second.

  Returns:
    out: The audio in samples from -1.0 to 1.0
  """
  audio, _ = librosa.load(path, sr=sr)
  audio = audio[:sample_length]
  return audio
Project: the-wavenet-pianist | Author: 821760408-sp
def load_generic_audio(directory, sample_rate):
    """Generator that yields audio waveforms from the directory."""

    def randomize_files(fns):
        for _ in fns:
            file_index = random.randint(0, len(fns) - 1)
            yield fns[file_index]

    files = find_files(directory)
    id_reg_exp = re.compile(FILE_PATTERN)
    print("files length: {}".format(len(files)))
    randomized_files = randomize_files(files)
    for filename in randomized_files:
        ids = id_reg_exp.findall(filename)
        if not ids:
            # The file name does not match the pattern containing ids, so
            # there is no id.
            category_id = None
        else:
            # The file name matches the pattern for containing ids.
            category_id = int(ids[0][0])
        audio, _ = librosa.load(filename, sr=sample_rate, mono=True)
        # Normalize audio
        audio = librosa.util.normalize(audio) * 0.8
        # Trim the last 5 seconds to account for music rollout
        audio = audio[:-5 * sample_rate]
        audio = np.reshape(audio, (-1, 1))
        yield audio, filename, category_id
Project: the-wavenet-pianist | Author: 821760408-sp
def load_wav(wavfile, sr, mono=True):
    audio, _ = librosa.load(wavfile, sr=sr, mono=mono)
    # Normalize audio
    audio = librosa.util.normalize(audio) * 0.8
    lc = AudioReader.midi_notes_encoding(audio)

    fn = os.path.splitext(os.path.abspath(wavfile))[0]  # strip('.wav') would eat any trailing w/a/v chars
    fn = "{}_lc_embedding.npy".format(fn)
    with open(fn, 'wb') as f:  # np.save writes binary data
        np.save(f, lc)
Project: the-wavenet-pianist | Author: 821760408-sp
def create_seed(filename,
                sample_rate,
                quantization_channels,
                window_size):
    audio, _ = librosa.load(filename, sr=sample_rate, mono=True)
    quantized = mu_law_encode(audio, quantization_channels)
    cut_index = tf.cond(tf.size(quantized) < tf.constant(window_size),
                        lambda: tf.size(quantized),
                        lambda: tf.constant(window_size))

    return quantized[:cut_index]
Project: the-wavenet-pianist | Author: 821760408-sp
def load_lc_embedding(lc_embedding):
    with open(lc_embedding, 'rb') as f:  # np.load reads binary data
        return np.load(f)
Project: pyVSR | Author: georgesterpu
def read_wav_file(file):
    r"""
    Loads wav files from disk and resamples to 22050 Hz
    The output is shaped as [timesteps, 1]
    Parameters
    ----------
    file

    Returns
    -------

    """
    import librosa
    data, sr = librosa.load(file)
    return np.expand_dims(data, axis=-1)
Project: TensorFlow_AudioSet_Example | Author: DantesLegacy
def load_sound_files(file_paths):
    raw_sounds = []
    for fp in file_paths:
        X,sr = librosa.load(fp)
        raw_sounds.append(X)
    return raw_sounds
Project: TensorFlow_AudioSet_Example | Author: DantesLegacy
def extract_feature(file_name):
    X, sample_rate = librosa.load(file_name)
    stft = np.abs(librosa.stft(X))
    mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T,axis=0)
    chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T,axis=0)
    mel = np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T,axis=0)
    contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T,axis=0)
    tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr=sample_rate).T,axis=0)
    return mfccs,chroma,mel,contrast,tonnetz
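
A hedged sketch of stacking the returned means into one fixed-length vector per file, mirroring the np.hstack pattern in the gcForest example later in this listing (the path is hypothetical):

import numpy as np

mfccs, chroma, mel, contrast, tonnetz = extract_feature("dog_bark.wav")
feature = np.hstack([mfccs, chroma, mel, contrast, tonnetz])  # 40 + 12 + 128 + 7 + 6 = 193 values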
Project: CNN-for-single-channel-speech-enhancement | Author: zhr1201
def norm_audio(self):
        '''Normalize the audio files.
        Run as an independent script before training.'''
        for file in self.audiofiles:
            audio, sr = librosa.load(file, sr=16000)
            div_fac = 1 / np.max(np.abs(audio)) / 3.0
            audio = audio * div_fac
            librosa.output.write_wav(file, audio, sr)
        for file in self.noisefiles:
            audio, sr = librosa.load(file, sr=16000)
            div_fac = 1 / np.max(np.abs(audio)) / 3.0
            audio = audio * div_fac
            librosa.output.write_wav(file, audio, sr)
Project: Personal_AI_Assistant | Author: PratylenClub
def callback(recognizer, audio):
    try:
        sentence = recognizer.recognize_google(audio, language=language)
        wave_file_name = "train.wav"
        wav_file = open(wave_file_name,"wb")
        wav_file.write(audio.get_wav_data())
        wav_file.close()
        wave, sample_rate = librosa.load(wave_file_name, mono=True, sr=None)
        wave = wave[::3]
        save_recording(wave_file_name,wave,sentence,CSV_BIG_ONE)

    except sr.UnknownValueError:
        print("Google Speech Recognition could not understand audio")
    except sr.RequestError as e:
        print("Could not request results from Google Speech Recognition service; {0}".format(e))
Project: srnn | Author: marcofraccaro
def load_wav_files(files):
    wav_files = []
    for i, f in enumerate(files):
        print(i, f)
        wav_files += [librosa.load(f, sr=SAMPLINGRATE)[0]]
    return wav_files
Project: tacotron | Author: jinfagang
def get_spectrograms(sound_file):
    '''Extracts melspectrogram and log magnitude from given `sound_file`.
    Args:
      sound_file: A string. Full path of a sound file.

    Returns:
      Transposed S: A 2d array. A transposed melspectrogram with shape of (T, n_mels)
      Transposed magnitude: A 2d array.Has shape of (T, 1+hp.n_fft//2)
    '''
    # Loading sound file
    y, sr = librosa.load(sound_file, sr=None)  # or set sr to hp.sr.

    # stft. D: (1+n_fft//2, T)
    D = librosa.stft(y=y,
                     n_fft=hp.n_fft,
                     hop_length=hp.hop_length,
                     win_length=hp.win_length)

    # magnitude spectrogram
    magnitude = np.abs(D)  # (1+n_fft/2, T)

    # power spectrogram
    power = magnitude ** 2  # (1+n_fft/2, T)

    # mel spectrogram
    S = librosa.feature.melspectrogram(S=power, n_mels=hp.n_mels)  # (n_mels, T)

    return np.transpose(S.astype(np.float32)), np.transpose(magnitude.astype(np.float32))  # (T, n_mels), (T, 1+n_fft/2)
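
A hedged usage sketch of the function above (hypothetical path; shapes follow the docstring, given the project's hp config):

mel, mag = get_spectrograms("sample.wav")  # mel: (T, n_mels), mag: (T, 1 + hp.n_fft//2)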
Project: aupyom | Author: pierre-rouanet
def test_load_sound(self):
        s1 = Sound.from_file(self.audio_file)

        y, sr = librosa.load(self.audio_file)
        s2 = Sound(y, sr)

        self.assertTrue(numpy.all(s1.y == s2.y))

        s3 = Sound(numpy.random.rand(random.randint(1, 100000)),
                   random.choice((88200, 44100, 22050, 11025)))
Project: aupyom | Author: pierre-rouanet
def from_file(cls, filename, sr=22050):
        """ Loads an audiofile, uses sr=22050 by default. """
        y, sr = librosa.load(filename, sr=sr)
        return cls(y, sr)

    # Chunk iterator
Project: DeepRemix | Author: DeepRemix
def parse_wav(filename, n_mfcc=40):
    '''
    Parses a single wav file into MFCC's and sample rate.

    Arguments:
        filename - Name of input wav file.
        n_mfcc   - Number of coefficients to use.

    Returns:
        A tuple with a numpy array with cepstrum coefficients, and sample rate.

    Raises:

    '''

    song_data = np.array([])
    sample_rate = -1
    if filename[-4:] == '.wav':
        try:
            y_data, sample_rate = librosa.load(filename)
            #  will need to experiment with different values for n_mfcc
            song_data = librosa.feature.mfcc(y=y_data,
                                             sr=sample_rate,
                                             n_mfcc=n_mfcc)
        except Exception:  # abort on any load/MFCC failure
            sys.exit(1)

    return (song_data, sample_rate)
Project: EUSIPCO2017 | Author: Veleslavia
def compute_spectrograms(filename):
    out_rate = 22050

    frames, rate = librosa.load(filename, sr=out_rate, mono=True)
    if len(frames) < out_rate:
        # shorter than 1 second - can't process
        raise Exception("Audio duration is too short")

    normalized_audio = _normalize(frames)
    melspectr = librosa.feature.melspectrogram(y=normalized_audio, sr=out_rate, n_mels=N_MEL_BANDS, fmax=out_rate/2)
    logmelspectr = librosa.logamplitude(melspectr**2, ref_power=1.0)

    # now going through spectrogram with the stride of the segment duration
    for start_idx in range(0, logmelspectr.shape[1] - SEGMENT_DUR + 1, SEGMENT_DUR):
        yield logmelspectr[:, start_idx:start_idx + SEGMENT_DUR]
Project: gcForest | Author: kingfengji
def get_feature_aqibsaeed_1(X, sr, au_path=None):
    """
    http://aqibsaeed.github.io/2016-09-03-urban-sound-classification-part-1/
    """
    import librosa
    if au_path is not None:
        X, sr = librosa.load(au_path)
    stft = np.abs(librosa.stft(X))
    mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sr, n_mfcc=40).T,axis=0)
    chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sr).T,axis=0)
    mel = np.mean(librosa.feature.melspectrogram(X, sr=sr).T,axis=0)
    contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sr).T,axis=0)
    tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr=sr).T,axis=0)
    feature = np.hstack([mfccs,chroma,mel,contrast,tonnetz])
    return feature
Project: gcForest | Author: kingfengji
def __init__(self, cache=None, **kwargs):
        super(GTZAN, self).__init__(**kwargs)
        if kwargs.get('conf') is not None:
            conf = kwargs['conf']
            cache = conf.get('cache', None)
        data_set_path = osp.join(DEFAULT_IMAGEST_BASE, self.data_set)
        self.data_set_path = data_set_path
        self.cache = cache
        X, y = parse_anno_file(data_set_path)
        if cache == 'raw':
            import librosa
            from tqdm import trange
            X_new = np.zeros((len(X), 1, 661500, 1))
            for i in trange(len(X)):
                x,_ = librosa.load(osp.join(DEFAULT_DATA_BASE, X[i]))
                x_len = min(661500, len(x))
                X_new[i,:,:x_len,0] = x[:x_len]
        if cache is not None and cache != 'raw':
            X = self.load_cache_X(X, cache)
            if cache == 'mfcc':
                X_new = np.zeros((len(X), X[0].shape[0], 1280, 1))
                for i, x in enumerate(X):
                    x_len = min(x.shape[1], 1280)
                    X_new[i,:,:x_len,0] = x[:,:x_len]
                X = X_new

        # layout_X
        if self.layout_x == 'rel_path':
            self.X = X
        else:
            self.X = self.init_layout_X(X)
        # layout_y
        self.y = self.init_layout_y(y)