Python tensorflow.python.platform.gfile module, Exists() example source code

We extracted the following 50 code examples from open-source Python projects to illustrate how to use tensorflow.python.platform.gfile.Exists().

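Before the project examples, here is a minimal, self-contained sketch of the pattern most of them share: guard an expensive build step behind gfile.Exists(). The path below is a hypothetical placeholder; in TensorFlow 2.x the same calls are also exposed as tf.io.gfile.exists() and tf.io.gfile.GFile().

from tensorflow.python.platform import gfile

output_path = "/tmp/example_vocab.txt"  # hypothetical path, for illustration only

if not gfile.Exists(output_path):
    # Build the file only when it is missing.
    with gfile.GFile(output_path, mode="w") as f:
        f.write("dog\ncat\n")
else:
    print("%s already exists, skipping." % output_path)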
Project: tensorflow_seq2seq_chatbot    Author: higepon    | project source | file source
def data_to_token_ids(data_path, target_path, vocabulary_path,
                      tokenizer=japanese_tokenizer, normalize_digits=True):
    if not gfile.Exists(target_path):
        print("Tokenizing data in %s" % data_path)
        vocab, _ = initialize_vocabulary(vocabulary_path)
        with gfile.GFile(data_path, mode="rb") as data_file:
            with gfile.GFile(target_path, mode="wb") as tokens_file:  # edit w to wb
                counter = 0
                for line in data_file:
#                    line = tf.compat.as_bytes(line)  # added by Ken
                    counter += 1
                    if counter % 100000 == 0:
                        print("  tokenizing line %d" % counter)
                    # line is binary here
                    line = line.decode('utf-8')
                    token_ids = sentence_to_token_ids(line, vocab, tokenizer,
                                                      normalize_digits)
                    tokens_file.write(" ".join([str(tok) for tok in token_ids]) + "\n")


# Originally from https://github.com/1228337123/tensorflow-seq2seq-chatbot
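Note that the snippet above reads data_file as bytes and decodes each line, but opens tokens_file in "wb" while writing a str. A hedged sketch of one consistent way to write the inner loop, reusing the names from that snippet (the rest of its module is assumed unchanged):

# Sketch only: encode the joined ids so the payload matches the "wb" mode.
with gfile.GFile(target_path, mode="wb") as tokens_file:
    for raw_line in data_file:
        line = raw_line.decode("utf-8")
        token_ids = sentence_to_token_ids(line, vocab, tokenizer, normalize_digits)
        tokens_file.write((" ".join(str(tok) for tok in token_ids) + "\n").encode("utf-8"))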
Project: basic-encoder-decoder    Author: pemywei    | project source | file source
def initialize_vocabulary(vocabulary_path):
    """
    Initialize vocabulary from file.
    Args:
        vocabulary_path: path to the file containing the vocabulary.
    Returns:
        a pair: the vocabulary (a dictionary mapping string to integers), and
        the reversed vocabulary (a list, which reverses the vocabulary mapping).
    Raises:
        ValueError: if the provided vocabulary_path does not exist.
    """
    if gfile.Exists(vocabulary_path):
        rev_vocab = []
        with gfile.GFile(vocabulary_path, mode="rb") as f:
            rev_vocab.extend(f.readlines())
        rev_vocab = [line.strip() for line in rev_vocab]
        vocab = dict([(x, y) for (y, x) in enumerate(rev_vocab)])
        return vocab, rev_vocab
    else:
        raise ValueError("Vocabulary file %s not found." % vocabulary_path)
Project: basic-encoder-decoder    Author: pemywei    | project source | file source
def data_to_token_ids(data_path, target_path, vocabulary_path, tokenizer=None):
    """
    Tokenize data file and turn into token-ids using given vocabulary file.

    This function loads data line-by-line from data_path, calls the above
    sentence_to_token_ids, and saves the result to target_path. See comment
    for sentence_to_token_ids on the details of token-ids format.

    Args:
        data_path: path to the data file in one-sentence-per-line format.
        target_path: path where the file with token-ids will be created.
        vocabulary_path: path to the vocabulary file.
        tokenizer: a function to use to tokenize each sentence;
        if None, basic_tokenizer will be used.
    """
    if not gfile.Exists(target_path):
        print("Tokenizing data in %s" % data_path)
        vocab, _ = initialize_vocabulary(vocabulary_path)
        with gfile.GFile(data_path, mode="rb") as data_file:
            with gfile.GFile(target_path, mode="w") as tokens_file:
                for line in data_file:
                    token_ids = sentence_to_token_ids(line, vocab, tokenizer)
                    tokens_file.write(" ".join([str(tok) for tok in token_ids]) + "\n")
Project: tensorflow-image-classifier    Author: burliEnterprises    | project source | file source
def create_bottleneck_file(bottleneck_path, image_lists, label_name, index,
                           image_dir, category, sess, jpeg_data_tensor,
                           bottleneck_tensor):
  """Create a single bottleneck file."""
  print('Creating bottleneck at ' + bottleneck_path)
  image_path = get_image_path(image_lists, label_name, index,
                              image_dir, category)
  if not gfile.Exists(image_path):
    tf.logging.fatal('File does not exist %s', image_path)
  image_data = gfile.FastGFile(image_path, 'rb').read()
  try:
    bottleneck_values = run_bottleneck_on_image(
        sess, image_data, jpeg_data_tensor, bottleneck_tensor)
  except:
    raise RuntimeError('Error during processing file %s' % image_path)

  bottleneck_string = ','.join(str(x) for x in bottleneck_values)
  with open(bottleneck_path, 'w') as bottleneck_file:
    bottleneck_file.write(bottleneck_string)
Project: Question-Answering    Author: MurtyShikhar    | project source | file source
def create_vocabulary(vocabulary_path, data_paths, tokenizer=None):
    if not gfile.Exists(vocabulary_path):
        print("Creating vocabulary %s from data %s" % (vocabulary_path, str(data_paths)))
        vocab = {}
        for path in data_paths:
            with open(path, mode="rb") as f:
                counter = 0
                for line in f:
                    counter += 1
                    if counter % 100000 == 0:
                        print("processing line %d" % counter)
                    tokens = tokenizer(line) if tokenizer else basic_tokenizer(line)
                    for w in tokens:
                        if w in vocab:
                            vocab[w] += 1
                        else:
                            vocab[w] = 1
        vocab_list = _START_VOCAB + sorted(vocab, key=vocab.get, reverse=True)
        print("Vocabulary size: %d" % len(vocab_list))
        with gfile.GFile(vocabulary_path, mode="wb") as vocab_file:
            for w in vocab_list:
                vocab_file.write(w + b"\n")
Project: deep-news-summarization    Author: hengluchang    | project source | file source
def data_to_token_ids(data_path, target_path, vocabulary_path,
                      tokenizer=None, normalize_digits=True):

  if not gfile.Exists(target_path):
    print("Tokenizing data in %s" % data_path)
    vocab, _ = initialize_vocabulary(vocabulary_path)
    with gfile.GFile(data_path, mode="r") as data_file:
      with gfile.GFile(target_path, mode="w") as tokens_file:
        counter = 0
        for line in data_file:
          counter += 1
          if counter % 1000 == 0:
            print("  tokenizing line %d" % counter)
          token_ids = sentence_to_token_ids(line, vocab, tokenizer,
                                            normalize_digits)
          tokens_file.write(" ".join([str(tok) for tok in token_ids]) + "\n")
Project: dahoam2017    Author: KarimJedda    | project source | file source
def create_data_list(image_dir):
  if not gfile.Exists(image_dir):
    print("Image directory '" + image_dir + "' not found.")
    return None
  extensions = ['jpg', 'JPG', 'jpeg', 'JPEG', 'png', 'PNG']
  print("Looking for images in '" + image_dir + "'")
  file_list = []
  for extension in extensions:
    file_glob = os.path.join(image_dir, '*.' + extension)
    file_list.extend(gfile.Glob(file_glob))
  if not file_list:
    print("No files found in '" + image_dir + "'")
    return None
  images = []
  labels = []
  for file_name in file_list:
    image = Image.open(file_name)
    image_gray = image.convert('L')
    image_resize = image_gray.resize(size=(IMAGE_WIDTH,IMAGE_HEIGHT))
    input_img = np.array(image_resize, dtype='int16')
    image.close()
    label_name = os.path.basename(file_name).split('_')[0]
    images.append(input_img)
    labels.append(label_name)
  return zip(images, labels)
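A hedged usage sketch for create_data_list() above; the directory is a hypothetical placeholder, and PIL, NumPy, IMAGE_WIDTH and IMAGE_HEIGHT are assumed to come from the project's surrounding module:

# Hypothetical call; returns None when the directory or images are missing.
data = create_data_list("/tmp/captcha_images")
if data is not None:
    for image_array, label in data:
        print(label, image_array.shape)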
Project: powerai-transfer-learning    Author: IBM    | project source | file source
def create_bottleneck_file(bottleneck_path, image_lists, label_name, index,
                           image_dir, category, sess, jpeg_data_tensor,
                           bottleneck_tensor):
  """Create a single bottleneck file."""
  print('Creating bottleneck at ' + bottleneck_path)
  image_path = get_image_path(image_lists, label_name, index,
                              image_dir, category)
  if not gfile.Exists(image_path):
    tf.logging.fatal('File does not exist %s', image_path)
  image_data = gfile.FastGFile(image_path, 'rb').read()
  try:
    bottleneck_values = run_bottleneck_on_image(
        sess, image_data, jpeg_data_tensor, bottleneck_tensor)
  except:
    raise RuntimeError('Error during processing file %s' % image_path)

  bottleneck_string = ','.join(str(x) for x in bottleneck_values)
  with open(bottleneck_path, 'w') as bottleneck_file:
    bottleneck_file.write(bottleneck_string)
Project: image-classification-tensorflow    Author: xuetsing    | project source | file source
def create_bottleneck_file(bottleneck_path, image_lists, label_name, index,
                           image_dir, category, sess, jpeg_data_tensor,
                           bottleneck_tensor):
    """Create a single bottleneck file."""
    print('Creating bottleneck at ' + bottleneck_path)
    image_path = get_image_path(image_lists, label_name, index,
                              image_dir, category)
    if not gfile.Exists(image_path):
        tf.logging.fatal('File does not exist %s', image_path)
    image_data = gfile.FastGFile(image_path, 'rb').read()
    try:
        bottleneck_values = run_bottleneck_on_image(
            sess, image_data, jpeg_data_tensor, bottleneck_tensor)
    except:
        raise RuntimeError('Error during processing file %s' % image_path)

    bottleneck_string = ','.join(str(x) for x in bottleneck_values)
    with open(bottleneck_path, 'w') as bottleneck_file:
        bottleneck_file.write(bottleneck_string)
Project: seq2seq-webchatbot    Author: zhaoyingjun    | project source | file source
def data_to_token_ids(data_path, target_path, vocabulary_path,
                      tokenizer=None, normalize_digits=True):

  if not gfile.Exists(target_path):
    print("Tokenizing data in %s" % data_path)
    vocab, _ = initialize_vocabulary(vocabulary_path)
    with gfile.GFile(data_path, mode="rb") as data_file:
      with gfile.GFile(target_path, mode="w") as tokens_file:
        counter = 0
        for line in data_file:
          counter += 1
          if counter % 100000 == 0:
            print("  tokenizing line %d" % counter)
          token_ids = sentence_to_token_ids(line, vocab, tokenizer,
                                            normalize_digits)
          tokens_file.write(" ".join([str(tok) for tok in token_ids]) + "\n")
Project: DMN-tensorflow    Author: sufengniu    | project source | file source
def combine_shuffle(data_dir, filename):
  if not (gfile.Exists(os.path.join(data_dir, filename + '_train.txt')) and gfile.Exists(os.path.join(data_dir, filename + '_test.txt'))):
    data_train = []
    data_test = []
    print ('Shuffle file in %s' % data_dir)
    for subdir, dirs, files in os.walk(data_dir):
      for afile in files:
        with gfile.GFile(os.path.join(subdir, afile), mode="r") as f:
          if afile.endswith("train.txt"):
            data_train.append(f.read())
          else:
            data_test.append(f.read())

    with gfile.GFile(os.path.join(data_dir, filename + '_train.txt'), mode="w") as train_file:
      train_file.write(''.join(data_train))
      train_file.close()
    with gfile.GFile(os.path.join(data_dir, filename + '_test.txt'), mode="w") as test_file:
      test_file.write(''.join(data_test))
      test_file.close()
Project: tensorflow-yys    Author: ystyle    | project source | file source
def create_bottleneck_file(bottleneck_path, image_lists, label_name, index,
                           image_dir, category, sess, jpeg_data_tensor,
                           bottleneck_tensor):
  """Create a single bottleneck file."""
  print('Creating bottleneck at ' + bottleneck_path)
  image_path = get_image_path(image_lists, label_name, index,
                              image_dir, category)
  if not gfile.Exists(image_path):
    tf.logging.fatal('File does not exist %s', image_path)
  image_data = gfile.FastGFile(image_path, 'rb').read()
  try:
    bottleneck_values = run_bottleneck_on_image(
        sess, image_data, jpeg_data_tensor, bottleneck_tensor)
  except:
    raise RuntimeError('Error during processing file %s' % image_path)

  bottleneck_string = ','.join(str(x) for x in bottleneck_values)
  with open(bottleneck_path, 'w') as bottleneck_file:
    bottleneck_file.write(bottleneck_string)
Project: joint-slu-lm    Author: HadoopIt    | project source | file source
def create_label_vocab(vocabulary_path, data_path):
  if not gfile.Exists(vocabulary_path):
    print("Creating vocabulary %s from data %s" % (vocabulary_path, data_path))
    vocab = {}
    with gfile.GFile(data_path, mode="r") as f:
      counter = 0
      for line in f:
        counter += 1
        if counter % 100000 == 0:
          print("  processing line %d" % counter)
#        tokens = tokenizer(line) if tokenizer else basic_tokenizer(line)
        label = line.strip()
        vocab[label] = 1
      label_list = START_VOCAB_dict['no_padding'] + sorted(vocab)
#      label_list = sorted(vocab)
      with gfile.GFile(vocabulary_path, mode="w") as vocab_file:
        for k in label_list:
          vocab_file.write(k + "\n")
Project: Technical-Analysis-And-Practice-in-TensorFlow    Author: greatgeekgrace    | project source | file source
def data_to_token_ids(data_path, target_path, vocabulary_path,
                      tokenizer=None, normalize_digits=True):

  if not gfile.Exists(target_path):
    print("Tokenizing data in %s" % data_path)
    vocab, _ = initialize_vocabulary(vocabulary_path)
    with gfile.GFile(data_path, mode="rb") as data_file:
      with gfile.GFile(target_path, mode="w") as tokens_file:
        counter = 0
        for line in data_file:
          counter += 1
          if counter % 100000 == 0:
            print("  tokenizing line %d" % counter)
          token_ids = sentence_to_token_ids(line, vocab, tokenizer,
                                            normalize_digits)
          tokens_file.write(" ".join([str(tok) for tok in token_ids]) + "\n")
Project: PlantImageRecognition    Author: HeavenMin    | project source | file source
def createBottleneckFile(bottleneckPath, imageLists, labelName, index,
                         imageDir, category, sess, jpegDataTensor,
                         bottleneckTensor):
    print('Create bottleneck at ' + bottleneckPath)
    imagePath = getImagePath(imageLists, labelName, index,
                             imageDir, category)
    if not gfile.Exists(imagePath):
        tf.logging.fatal('File not exist %s', imagePath)
    imageData = gfile.FastGFile(imagePath, 'rb').read()
    try:
        bottleneckValues = runBottleneckOnImage(sess,
                           imageData, jpegDataTensor, bottleneckTensor)
    except:
        pass

    bottleneckString = ','.join(str(x) for x in bottleneckValues)
    with open(bottleneckPath, 'w') as f:
        f.write(bottleneckString)
Project: PlantImageRecognition    Author: HeavenMin    | project source | file source
def getRandomDistortedBottlenecks(sess, imageLists, num, category, imageDir,
                                  inputJpegTensor, distortedImage,
                                  resizedInputTensor, bottleneckTensor):
    classCount = len(imageLists.keys())
    bottlenecks = []
    groundTruths = []
    for _ in range(num):
        labelIndex = random.randrange(classCount)
        labelName = list(imageLists.keys())[labelIndex]
        imageIndex = random.randrange(MAX_NUM_IMAGES_PER_CLASS + 1)
        imagePath = getImagePath(imageLists, labelName, imageIndex,
                                 imageDir, category)
        if not gfile.Exists(imagePath):
            tf.logging.fatal('File not exist %s', imagePath)
        jpegData = gfile.FastGFile(imagePath, 'rb').read()
        distortedImageData = sess.run(distortedImage, {inputJpegTensor: jpegData})
        bottleneck = runBottleneckOnImage(sess, distortedImageData,
                                          resizedInputTensor, bottleneckTensor)
        groundTruth = np.zeros(classCount, dtype = np.float32)
        groundTruth[labelIndex] = 1.0
        bottlenecks.append(bottleneck)
        groundTruths.append(groundTruth)
    return bottlenecks, groundTruths
Project: PlantImageRecognition    Author: HeavenMin    | project source | file source
def createBottleneckFile(bottleneckPath, imageLists, labelName, index,
                         imageDir, category, sess, jpegDataTensor,
                         bottleneckTensor):
    print('Create bottleneck at ' + bottleneckPath)
    imagePath = getImagePath(imageLists, labelName, index,
                             imageDir, category)
    if not gfile.Exists(imagePath):
        tf.logging.fatal('File not exist %s', imagePath)
    imageData = gfile.FastGFile(imagePath, 'rb').read()
    try:
        bottleneckValues = runBottleneckOnImage(sess,
                           imageData, jpegDataTensor, bottleneckTensor)
    except:
        pass

    bottleneckString = ','.join(str(x) for x in bottleneckValues)
    with open(bottleneckPath, 'w') as f:
        f.write(bottleneckString)
Project: PlantImageRecognition    Author: HeavenMin    | project source | file source
def createBottleneckFile(bottleneckPath, imageLists, labelName, index,
                         imageDir, category, sess, jpegDataTensor,
                         bottleneckTensor):
    print('Create bottleneck at ' + bottleneckPath)
    imagePath = getImagePath(imageLists, labelName, index,
                             imageDir, category)
    if not gfile.Exists(imagePath):
        tf.logging.fatal('File not exist %s', imagePath)
    imageData = gfile.FastGFile(imagePath, 'rb').read()
    try:
        bottleneckValues = runBottleneckOnImage(sess,
                           imageData, jpegDataTensor, bottleneckTensor)
    except:
        pass

    bottleneckString = ','.join(str(x) for x in bottleneckValues)
    with open(bottleneckPath, 'w') as f:
        f.write(bottleneckString)
Project: PlantImageRecognition    Author: HeavenMin    | project source | file source
def getRandomDistortedBottlenecks(sess, imageLists, num, category, imageDir,
                                  inputJpegTensor, distortedImage,
                                  resizedInputTensor, bottleneckTensor):
    classCount = len(imageLists.keys())
    bottlenecks = []
    groundTruths = []
    for _ in range(num):
        labelIndex = random.randrange(classCount)
        labelName = list(imageLists.keys())[labelIndex]
        imageIndex = random.randrange(MAX_NUM_IMAGES_PER_CLASS + 1)
        imagePath = getImagePath(imageLists, labelName, imageIndex,
                                 imageDir, category)
        if not gfile.Exists(imagePath):
            tf.logging.fatal('File not exist %s', imagePath)
        jpegData = gfile.FastGFile(imagePath, 'rb').read()
        distortedImageData = sess.run(distortedImage, {inputJpegTensor: jpegData})
        bottleneck = runBottleneckOnImage(sess, distortedImageData,
                                          resizedInputTensor, bottleneckTensor)
        groundTruth = np.zeros(classCount, dtype = np.float32)
        groundTruth[labelIndex] = 1.0
        bottlenecks.append(bottleneck)
        groundTruths.append(groundTruth)
    return bottlenecks, groundTruths
Project: PlantImageRecognition    Author: HeavenMin    | project source | file source
def createBottleneckFile(bottleneckPath, imageLists, labelName, index,
                         imageDir, category, sess, jpegDataTensor,
                         bottleneckTensor):
    print('Create bottleneck at ' + bottleneckPath)
    imagePath = getImagePath(imageLists, labelName, index,
                             imageDir, category)
    if not gfile.Exists(imagePath):
        tf.logging.fatal('File not exist %s', imagePath)
    imageData = gfile.FastGFile(imagePath, 'rb').read()
    try:
        bottleneckValues = runBottleneckOnImage(sess,
                           imageData, jpegDataTensor, bottleneckTensor)
    except:
        pass

    bottleneckString = ','.join(str(x) for x in bottleneckValues)
    with open(bottleneckPath, 'w') as f:
        f.write(bottleneckString)
Project: PlantImageRecognition    Author: HeavenMin    | project source | file source
def createBottleneckFile(bottleneckPath, imageLists, labelName, index,
                         imageDir, category, sess, jpegDataTensor,
                         bottleneckTensor):
    print('Create bottleneck at ' + bottleneckPath)
    imagePath = getImagePath(imageLists, labelName, index,
                             imageDir, category)
    if not gfile.Exists(imagePath):
        tf.logging.fatal('File not exist %s', imagePath)
    imageData = gfile.FastGFile(imagePath, 'rb').read()
    try:
        bottleneckValues = runBottleneckOnImage(sess,
                           imageData, jpegDataTensor, bottleneckTensor)
    except:
        pass

    bottleneckString = ','.join(str(x) for x in bottleneckValues)
    with open(bottleneckPath, 'w') as f:
        f.write(bottleneckString)
Project: PlantImageRecognition    Author: HeavenMin    | project source | file source
def getRandomDistortedBottlenecks(sess, imageLists, num, category, imageDir,
                                  inputJpegTensor, distortedImage,
                                  resizedInputTensor, bottleneckTensor):
    classCount = len(imageLists.keys())
    bottlenecks = []
    groundTruths = []
    for _ in range(num):
        labelIndex = random.randrange(classCount)
        labelName = list(imageLists.keys())[labelIndex]
        imageIndex = random.randrange(MAX_NUM_IMAGES_PER_CLASS + 1)
        imagePath = getImagePath(imageLists, labelName, imageIndex,
                                 imageDir, category)
        if not gfile.Exists(imagePath):
            tf.logging.fatal('File not exist %s', imagePath)
        jpegData = gfile.FastGFile(imagePath, 'rb').read()
        distortedImageData = sess.run(distortedImage, {inputJpegTensor: jpegData})
        bottleneck = runBottleneckOnImage(sess, distortedImageData,
                                          resizedInputTensor, bottleneckTensor)
        groundTruth = np.zeros(classCount, dtype = np.float32)
        groundTruth[labelIndex] = 1.0
        bottlenecks.append(bottleneck)
        groundTruths.append(groundTruth)
    return bottlenecks, groundTruths
Project: neural-chat    Author: henriblancke    | project source | file source
def data_to_token_ids(data_path, target_path, vocabulary_path):
    """Tokenize preprocess file and turn into token-ids using given vocabulary file.

    This function loads preprocess line-by-line from data_path, calls the above
    sentence_to_token_ids, and saves the result to target_path. See comment
    for sentence_to_token_ids on the details of token-ids format.

    Args:
      data_path: path to the preprocess file in one-sentence-per-line format.
      target_path: path where the file with token-ids will be created.
      vocabulary_path: path to the vocabulary file.
    """
    if not gfile.Exists(target_path):
        print("Tokenizing preprocess in %s" % data_path)

        vocab, _ = initialize_vocabulary(vocabulary_path)
        with gfile.GFile(data_path, mode="r") as data_file:
            with gfile.GFile(target_path, mode="w") as tokens_file:
                for line in tqdm(data_file):
                    token_ids = sentence_to_token_ids(line, vocab)
                    tokens_file.write(" ".join([str(tok) for tok in token_ids]) + "\n")
Project: lsdc    Author: febert    | project source | file source
def testPathsWithParse(self):
    base_dir = os.path.join(tf.test.get_temp_dir(), "paths_parse")
    self.assertFalse(gfile.Exists(base_dir))
    for p in xrange(3):
      gfile.MakeDirs(os.path.join(base_dir, "%d" % p))
    # add a base_directory to ignore
    gfile.MakeDirs(os.path.join(base_dir, "ignore"))

    # create a simple parser that pulls the export_version from the directory.
    def parser(path):
      match = re.match("^" + base_dir + "/(\\d+)$", path.path)
      if not match:
        return None
      return path._replace(export_version=int(match.group(1)))

    self.assertEquals(
        gc.get_paths(base_dir, parser=parser),
        [gc.Path(os.path.join(base_dir, "0"), 0),
         gc.Path(os.path.join(base_dir, "1"), 1),
         gc.Path(os.path.join(base_dir, "2"), 2)])
Project: lsdc    Author: febert    | project source | file source
def maybe_download(filename, work_directory, source_url):
  """Download the data from source url, unless it's already here.

  Args:
      filename: string, name of the file in the directory.
      work_directory: string, path to working directory.
      source_url: url to download from if file doesn't exist.

  Returns:
      Path to resulting file.
  """
  if not gfile.Exists(work_directory):
    gfile.MakeDirs(work_directory)
  filepath = os.path.join(work_directory, filename)
  if not gfile.Exists(filepath):
    with tempfile.NamedTemporaryFile() as tmpfile:
      temp_file_name = tmpfile.name
      urllib.request.urlretrieve(source_url, temp_file_name)
      gfile.Copy(temp_file_name, filepath)
      with gfile.GFile(filepath) as f:
        size = f.size()
      print('Successfully downloaded', filename, size, 'bytes.')
  return filepath
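A hedged usage sketch of the maybe_download() helper above; the filename, directory, and URL are placeholders, not values from the project:

# Hypothetical call: downloads only when /tmp/mnist_data/train-images-idx3-ubyte.gz is absent.
filepath = maybe_download("train-images-idx3-ubyte.gz",
                          "/tmp/mnist_data",
                          "https://example.com/train-images-idx3-ubyte.gz")
print("Data file available at", filepath)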
Project: lsdc    Author: febert    | project source | file source
def testPathsWithParse(self):
    base_dir = os.path.join(tf.test.get_temp_dir(), "paths_parse")
    self.assertFalse(gfile.Exists(base_dir))
    for p in xrange(3):
      gfile.MakeDirs(os.path.join(base_dir, "%d" % p))
    # add a base_directory to ignore
    gfile.MakeDirs(os.path.join(base_dir, "ignore"))

    # create a simple parser that pulls the export_version from the directory.
    def parser(path):
      match = re.match("^" + base_dir + "/(\\d+)$", path.path)
      if not match:
        return None
      return path._replace(export_version=int(match.group(1)))

    self.assertEquals(
        gc.get_paths(base_dir, parser=parser),
        [gc.Path(os.path.join(base_dir, "0"), 0),
         gc.Path(os.path.join(base_dir, "1"), 1),
         gc.Path(os.path.join(base_dir, "2"), 2)])
Project: lsdc    Author: febert    | project source | file source
def maybe_download(filename, work_directory, source_url):
  """Download the data from source url, unless it's already here.

  Args:
      filename: string, name of the file in the directory.
      work_directory: string, path to working directory.
      source_url: url to download from if file doesn't exist.

  Returns:
      Path to resulting file.
  """
  if not gfile.Exists(work_directory):
    gfile.MakeDirs(work_directory)
  filepath = os.path.join(work_directory, filename)
  if not gfile.Exists(filepath):
    temp_file_name, _ = urlretrieve_with_retry(source_url)
    gfile.Copy(temp_file_name, filepath)
    with gfile.GFile(filepath) as f:
      size = f.size()
    print('Successfully downloaded', filename, size, 'bytes.')
  return filepath
Project: tensorflow-for-poets-2    Author: googlecodelabs    | project source | file source
def create_bottleneck_file(bottleneck_path, image_lists, label_name, index,
                           image_dir, category, sess, jpeg_data_tensor,
                           decoded_image_tensor, resized_input_tensor,
                           bottleneck_tensor):
  """Create a single bottleneck file."""
  tf.logging.info('Creating bottleneck at ' + bottleneck_path)
  image_path = get_image_path(image_lists, label_name, index,
                              image_dir, category)
  if not gfile.Exists(image_path):
    tf.logging.fatal('File does not exist %s', image_path)
  image_data = gfile.FastGFile(image_path, 'rb').read()
  try:
    bottleneck_values = run_bottleneck_on_image(
        sess, image_data, jpeg_data_tensor, decoded_image_tensor,
        resized_input_tensor, bottleneck_tensor)
  except Exception as e:
    raise RuntimeError('Error during processing file %s (%s)' % (image_path,
                                                                 str(e)))
  bottleneck_string = ','.join(str(x) for x in bottleneck_values)
  with open(bottleneck_path, 'w') as bottleneck_file:
    bottleneck_file.write(bottleneck_string)
Project: tensorflow_chatbot    Author: llSourcell    | project source | file source
def data_to_token_ids(data_path, target_path, vocabulary_path,
                      tokenizer=None, normalize_digits=True):

  if not gfile.Exists(target_path):
    print("Tokenizing data in %s" % data_path)
    vocab, _ = initialize_vocabulary(vocabulary_path)
    with gfile.GFile(data_path, mode="rb") as data_file:
      with gfile.GFile(target_path, mode="w") as tokens_file:
        counter = 0
        for line in data_file:
          counter += 1
          if counter % 100000 == 0:
            print("  tokenizing line %d" % counter)
          token_ids = sentence_to_token_ids(line, vocab, tokenizer,
                                            normalize_digits)
          tokens_file.write(" ".join([str(tok) for tok in token_ids]) + "\n")
Project: weighted_DCNN_IQA    Author: HC-2016    | project source | file source
def convert_to(x, y, z, filename):
    """Converts data to tfrecords.

    Args:
      :param x, y: lists of images - [img1, img2, ...]; each img is an ndarray.
      :param z: list of float labels, one per (image_x, image_y) pair.
      :param filename: str, path of the TFRecord file to create.
    """
    if not gfile.Exists(filename):
        print('Writing', filename)
        writer = tf.python_io.TFRecordWriter(filename)
        for index in range(NUM_PER_IMAGE):
            image_x = x[index].tostring()
            image_y = y[index].tostring()
            example = tf.train.Example(features=tf.train.Features(feature={
                'label': _float_feature(z[index]),
                'image_x': _bytes_feature(image_x),
                'image_y': _bytes_feature(image_y)
            }))
            writer.write(example.SerializeToString())
        writer.close()
Project: weighted_DCNN_IQA    Author: HC-2016    | project source | file source
def convert_to(x, y, z, filename):
    """Converts data to tfrecords.

    Args:
      :param x, y: lists of images - [img1, img2, ...]; each img is an ndarray.
      :param z: list of float labels, one per (image_x, image_y) pair.
      :param filename: str, path of the TFRecord file to create.
    """
    if not gfile.Exists(filename):
        print('Writing', filename)
        writer = tf.python_io.TFRecordWriter(filename)
        for index in range(NUM_PER_IMAGE):
            image_x = x[index].tostring()
            image_y = y[index].tostring()
            example = tf.train.Example(features=tf.train.Features(feature={
                'label': _float_feature(z[index]),
                'image_x': _bytes_feature(image_x),
                'image_y': _bytes_feature(image_y)
            }))
            writer.write(example.SerializeToString())
        writer.close()
Project: chatbot    Author: bikash    | project source | file source
def data_to_token_ids(data_path, target_path, vocabulary_path,
                      tokenizer=None, normalize_digits=True):

  if not gfile.Exists(target_path):
    print("Tokenizing data in %s" % data_path)
    vocab, _ = initialize_vocabulary(vocabulary_path)
    with gfile.GFile(data_path, mode="rb") as data_file:
      with gfile.GFile(target_path, mode="w") as tokens_file:
        counter = 0
        for line in data_file:
          counter += 1
          if counter % 100000 == 0:
            print("  tokenizing line %d" % counter)
          token_ids = sentence_to_token_ids(line, vocab, tokenizer,
                                            normalize_digits)
          tokens_file.write(" ".join([str(tok) for tok in token_ids]) + "\n")
Project: reslearn    Author: mackcmillion    | project source | file source
def load_meanstddev(path):
    # load precomputed mean/stddev
    if not gfile.Exists(path):
        raise ValueError('Mean/stddev file not found.')

    assert gfile.Exists(path)
    mean_stddev_string = open(path, 'r').read().split('\n')
    mean_str = mean_stddev_string[0][1:-1].split(',')
    stddev_str = mean_stddev_string[1][1:-1].split(',')
    eigval_str = mean_stddev_string[2][1:-1].split(',')
    eigvecs_str = mean_stddev_string[3][1:-1].split(' ')

    mean = [float(mean_str[0]), float(mean_str[1]), float(mean_str[2])]
    stddev = [float(stddev_str[0]), float(stddev_str[1]), float(stddev_str[2])]
    eigvals = [float(eigval_str[0]), float(eigval_str[1]), float(eigval_str[2])]
    eigvecs = []
    for eigvec_str in eigvecs_str:
        eigvec = eigvec_str[1:-1].split(',')
        eigvecs.append([float(eigvec[0]), float(eigvec[1]), float(eigvec[2])])
    return mean, stddev, eigvals, eigvecs
Project: single-image-depth-estimation    Author: liuhyCV    | project source | file source
def output_predict(depths, images, output_dir):
    print("output predict into %s" % output_dir)
    if not gfile.Exists(output_dir):
        gfile.MakeDirs(output_dir)
    for i, (image, depth) in enumerate(zip(images, depths)):
        pilimg = Image.fromarray(np.uint8(image))
        image_name = "%s/%05d_org.png" % (output_dir, i)
        pilimg.save(image_name)
        depth = depth.transpose(2, 0, 1)
        if np.max(depth) != 0:
            ra_depth = (depth/np.max(depth))*255.0
        else:
            ra_depth = depth*255.0
        depth_pil = Image.fromarray(np.uint8(ra_depth[0]), mode="L")
        depth_name = "%s/%05d_dep.png" % (output_dir, i)
        depth_pil.save(depth_name)
Project: easybot    Author: undersail    | project source | file source
def data_to_token_ids(data_path, target_path, vocabulary_path,
                      tokenizer=None, normalize_digits=True):

  if not gfile.Exists(target_path):
    print("Tokenizing data in %s" % data_path)
    vocab, _ = initialize_vocabulary(vocabulary_path)
    with gfile.GFile(data_path, mode="rb") as data_file:
      with gfile.GFile(target_path, mode="w") as tokens_file:
        counter = 0
        for line in data_file:
          counter += 1
          if counter % 100000 == 0:
            print("  tokenizing line %d" % counter)
          token_ids = sentence_to_token_ids(line, vocab, tokenizer,
                                            normalize_digits)
          tokens_file.write(" ".join([str(tok) for tok in token_ids]) + "\n")
Project: tensorflow_seq2seq_chatbot    Author: higepon    | project source | file source
def initialize_vocabulary(vocabulary_path):
  """Initialize vocabulary from file.

  We assume the vocabulary is stored one-item-per-line, so a file:
    dog
    cat
  will result in a vocabulary {"dog": 0, "cat": 1}, and this function will
  also return the reversed-vocabulary ["dog", "cat"].

  Args:
    vocabulary_path: path to the file containing the vocabulary.

  Returns:
    a pair: the vocabulary (a dictionary mapping string to integers), and
    the reversed vocabulary (a list, which reverses the vocabulary mapping).

  Raises:
    ValueError: if the provided vocabulary_path does not exist.
  """
  if gfile.Exists(vocabulary_path):
    rev_vocab = []
    with gfile.GFile(vocabulary_path, mode="r") as f:
      rev_vocab.extend(f.readlines())
    rev_vocab = [line.strip() for line in rev_vocab]
    vocab = dict([(x, y) for (y, x) in enumerate(rev_vocab)])
    return vocab, rev_vocab
  else:
    raise ValueError("Vocabulary file %s not found." % vocabulary_path)
Project: tensorflow_seq2seq_chatbot    Author: higepon    | project source | file source
def data_to_token_ids(data_path, target_path, vocabulary_path,
                      tokenizer=None, normalize_digits=True):
  """Tokenize data file and turn into token-ids using given vocabulary file.

  This function loads data line-by-line from data_path, calls the above
  sentence_to_token_ids, and saves the result to target_path. See comment
  for sentence_to_token_ids on the details of token-ids format.

  Args:
    data_path: path to the data file in one-sentence-per-line format.
    target_path: path where the file with token-ids will be created.
    vocabulary_path: path to the vocabulary file.
    tokenizer: a function to use to tokenize each sentence;
      if None, basic_tokenizer will be used.
    normalize_digits: Boolean; if true, all digits are replaced by 0s.
  """
  if not gfile.Exists(target_path):
    print("Tokenizing data in %s" % data_path)
    vocab, _ = initialize_vocabulary(vocabulary_path)
    with gfile.GFile(data_path, mode="r") as data_file:
      with gfile.GFile(target_path, mode="w") as tokens_file:
        counter = 0
        for line in data_file:
          counter += 1
          if counter % 100000 == 0:
            print("  tokenizing line %d" % counter)
          token_ids = sentence_to_token_ids(line, vocab, tokenizer,
                                            normalize_digits)
          tokens_file.write(" ".join([str(tok) for tok in token_ids]) + "\n")
Project: tensorflow_seq2seq_chatbot    Author: higepon    | project source | file source
def initialize_vocabulary(vocabulary_path):
    if gfile.Exists(vocabulary_path):
        rev_vocab = []
        with gfile.GFile(vocabulary_path, mode="r") as f:
            rev_vocab.extend(f.readlines())
        rev_vocab = [line.strip() for line in rev_vocab]
        # Dictionary of (word, idx)
        vocab = dict([(x, y) for (y, x) in enumerate(rev_vocab)])
        return vocab, rev_vocab
    else:
        raise ValueError("Vocabulary file %s not found." % vocabulary_path)


# From https://github.com/1228337123/tensorflow-seq2seq-chatbot
Project: Biseq2Seq_NLG    Author: MaZhiyuanBUAA    | project source | file source
def create_model(session, forward_only,batch_size=None):
  """Create translation model and initialize or load parameters in session."""
  model = seq2seq_model.Seq2SeqModel(
      vocab_size=FLAGS.vocab_size,
      embedding_dim=FLAGS.embedding_dim,
      buckets=BUCKETS,
      size=FLAGS.size,
      num_layers=FLAGS.num_layers,
      max_gradient_norm=FLAGS.max_gradient_norm,
      batch_size=FLAGS.batch_size if not batch_size else batch_size,
      learning_rate=FLAGS.learning_rate,
      learning_rate_decay_factor=FLAGS.learning_rate_decay_factor,
      use_lstm=True,
      forward_only=forward_only)

  ckpt = tf.train.get_checkpoint_state(FLAGS.model_dir)
  #print('path:',ckpt.model_checkpoint_path)
  #print('gfile:',gfile.Exists(ckpt.model_checkpoint_path))
  #if ckpt and gfile.Exists(ckpt.model_checkpoint_path):
  if ckpt:
    print("Reading model parameters from %s" % ckpt.model_checkpoint_path)
    model.saver.restore(session, ckpt.model_checkpoint_path)
  else:
    print("Created model with fresh parameters.")
    session.run(tf.global_variables_initializer())
  return model
Project: Biseq2Seq_NLG    Author: MaZhiyuanBUAA    | project source | file source
def create_vocabulary(vocabulary_path, data_path, max_vocabulary_size,
                      tokenizer=None, normalize_digits=True):
  """Create vocabulary file (if it does not exist yet) from data file.

  Data file is assumed to contain one sentence per line. Each sentence is
  tokenized and digits are normalized (if normalize_digits is set).
  Vocabulary contains the most-frequent tokens up to max_vocabulary_size.
  We write it to vocabulary_path in a one-token-per-line format, so that later
  token in the first line gets id=0, second line gets id=1, and so on.

  Args:
    vocabulary_path: path where the vocabulary will be created.
    data_path: data file that will be used to create vocabulary.
    max_vocabulary_size: limit on the size of the created vocabulary.
    tokenizer: a function to use to tokenize each data sentence;
      if None, basic_tokenizer will be used.
    normalize_digits: Boolean; if true, all digits are replaced by 0s.
  """
  if not gfile.Exists(vocabulary_path):
    print("Creating vocabulary %s from data %s" % (vocabulary_path, data_path))
    vocab = {}
    #with gfile.GFile(data_path, mode="r") as f:
    with open(data_path,'rb') as f:
      counter = 0
      for line in f.readlines():
        counter += 1
        if counter % 100000 == 0:
          print("  processing line %d" % counter)
        tokens = tokenizer(line.decode('utf-8')) if tokenizer else basic_tokenizer(line.decode('utf-8'))
        for w in tokens:
          word = re.sub(_DIGIT_RE, "0", w) if normalize_digits else w
          if word in vocab:
            vocab[word] += 1
          else:
            vocab[word] = 1
      vocab_list = _START_VOCAB + sorted(vocab, key=vocab.get, reverse=True)
      if len(vocab_list) > max_vocabulary_size:
        vocab_list = vocab_list[:max_vocabulary_size]
      with gfile.GFile(vocabulary_path, mode="w") as vocab_file:
        for w in vocab_list:
          vocab_file.write(w + "\n")
Project: Biseq2Seq_NLG    Author: MaZhiyuanBUAA    | project source | file source
def initialize_vocabulary(vocabulary_path):
  """Initialize vocabulary from file.

  We assume the vocabulary is stored one-item-per-line, so a file:
    dog
    cat
  will result in a vocabulary {"dog": 0, "cat": 1}, and this function will
  also return the reversed-vocabulary ["dog", "cat"].

  Args:
    vocabulary_path: path to the file containing the vocabulary.

  Returns:
    a pair: the vocabulary (a dictionary mapping string to integers), and
    the reversed vocabulary (a list, which reverses the vocabulary mapping).

  Raises:
    ValueError: if the provided vocabulary_path does not exist.
  """
  if gfile.Exists(vocabulary_path):
    rev_vocab = []

    with gfile.GFile(vocabulary_path, mode="r") as f:
      rev_vocab.extend(f.readlines())

    rev_vocab = [line.strip() for line in rev_vocab]
    vocab = dict([(x, y) for (y, x) in enumerate(rev_vocab)])
    return vocab, rev_vocab

  else:
    raise ValueError("Vocabulary file %s not found." % vocabulary_path)
Project: Biseq2Seq_NLG    Author: MaZhiyuanBUAA    | project source | file source
def data_to_token_ids(data_path, target_path, vocabulary_path,
                      tokenizer=None, normalize_digits=True):
  """Tokenize data file and turn into token-ids using given vocabulary file.

  This function loads data line-by-line from data_path, calls the above
  sentence_to_token_ids, and saves the result to target_path. See comment
  for sentence_to_token_ids on the details of token-ids format.

  Args:
    data_path: path to the data file in one-sentence-per-line format.
    target_path: path where the file with token-ids will be created.
    vocabulary_path: path to the vocabulary file.
    tokenizer: a function to use to tokenize each sentence;
      if None, basic_tokenizer will be used.
    normalize_digits: Boolean; if true, all digits are replaced by 0s.
  """
  if not gfile.Exists(target_path):
    print("Tokenizing data in %s" % data_path)
    vocab, _ = initialize_vocabulary(vocabulary_path)
    with gfile.GFile(data_path, mode="r") as data_file:
      with gfile.GFile(target_path, mode="w") as tokens_file:
        counter = 0
        for line in data_file:
          counter += 1
          if counter % 100000 == 0:
            print("  tokenizing line %d" % counter)
          token_ids = sentence_to_token_ids(line, vocab, tokenizer,
                                            normalize_digits)
          tokens_file.write(" ".join([str(tok) for tok in token_ids]) + "\n")
Project: WassersteinGAN.tensorflow    Author: shekkizh    | project source | file source
def create_image_lists(image_dir, testing_percentage=0.0, validation_percentage=0.0):
    """
    Code modified from tensorflow/tensorflow/examples/image_retraining
    """
    if not gfile.Exists(image_dir):
        print("Image directory '" + image_dir + "' not found.")
        return None
    training_images = []
    extensions = ['jpg', 'jpeg', 'JPG', 'JPEG']
    sub_dirs = [x[0] for x in os.walk(image_dir)]
    file_list = []

    for extension in extensions:
        file_glob = os.path.join(image_dir, '*.' + extension)
        file_list.extend(glob.glob(file_glob))

    if not file_list:
        print('No files found')
    else:
        # print "No. of files found: %d" % len(file_list)
        training_images.extend([f for f in file_list])

    random.shuffle(training_images)
    no_of_images = len(training_images)
    validation_offset = int(validation_percentage * no_of_images)
    validation_images = training_images[:validation_offset]
    test_offset = int(testing_percentage * no_of_images)
    testing_images = training_images[validation_offset:validation_offset + test_offset]
    training_images = training_images[validation_offset + test_offset:]

    result = {
        'train': training_images,
        'test': testing_images,
        'validation': validation_images,
    }
    return result
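A hedged usage sketch of create_image_lists() above; the image directory is a hypothetical placeholder:

# Hypothetical call: 80/10/10 split of the JPEGs found directly under the directory.
splits = create_image_lists("/tmp/celebA", testing_percentage=0.1, validation_percentage=0.1)
if splits is not None:
    print(len(splits['train']), len(splits['test']), len(splits['validation']))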
Project: basic-encoder-decoder    Author: pemywei    | project source | file source
def create_vocabulary(vocabulary_path, data_path, max_vocabulary_size, tokenizer=None):
    """Create vocabulary file from data file.
    Data file is assumed to contain one sentence per line. Each sentence is
    tokenized and digits are normalized (if normalize_digits is set).
    Vocabulary contains the most-frequent tokens up to max_vocabulary_size.
    We write it to vocabulary_path in a one-token-per-line format, so that later
    token in the first line gets id=0, second line gets id=1, and so on.

    Args:
        vocabulary_path: path where the vocabulary will be created.
        data_path: data file that will be used to create vocabulary.
        tokenizer: a function to use to tokenize each data sentence;
        if None, basic_tokenizer will be used.
    """
    if not gfile.Exists(vocabulary_path):
        print("Creating vocabulary %s from data %s" % (vocabulary_path, data_path))
        vocab = {}
        with gfile.GFile(data_path, mode="rb") as f:
            for line in f:
                tokens = tokenizer(line) if tokenizer else basic_tokenizer(line)

                for word in tokens:
                    if word in vocab:
                        vocab[word] += 1
                    else:
                        vocab[word] = 1
            vocab_list = _START_VOCAB + sorted(vocab, key=vocab.get, reverse=True)
            if len(vocab_list) > max_vocabulary_size:
                vocab_list = vocab_list[:max_vocabulary_size]
            with gfile.GFile(vocabulary_path, mode="wb") as vocab_file:
                for w in vocab_list:
                    vocab_file.write(w + b"\n")
Project: tf-translate    Author: chrislit    | project source | file source
def create_vocabulary(vocabulary_path, data_path, max_vocabulary_size,
                      tokenizer=None, normalize_digits=True):
  """Create vocabulary file (if it does not exist yet) from data file.

  Data file is assumed to contain one sentence per line. Each sentence is
  tokenized and digits are normalized (if normalize_digits is set).
  Vocabulary contains the most-frequent tokens up to max_vocabulary_size.
  We write it to vocabulary_path in a one-token-per-line format, so that later
  token in the first line gets id=0, second line gets id=1, and so on.

  Args:
    vocabulary_path: path where the vocabulary will be created.
    data_path: data file that will be used to create vocabulary.
    max_vocabulary_size: limit on the size of the created vocabulary.
    tokenizer: a function to use to tokenize each data sentence;
      if None, basic_word_tokenizer will be used.
    normalize_digits: Boolean; if true, all digits are replaced by 0s.
  """
  if not gfile.Exists(vocabulary_path):
    print("Creating vocabulary %s from data %s" % (vocabulary_path, data_path))
    vocab = {}
    with gfile.GFile(data_path, mode="rb") as f:
      counter = 0
      for line in f:
        counter += 1
        if counter % 100000 == 0:
          print("  processing line %d" % counter)
        tokens = tokenizer(line) if tokenizer else basic_word_tokenizer(line)
        for w in tokens:
          word = re.sub(_DIGIT_RE, b"0", w) if normalize_digits else w
          if word in vocab:
            vocab[word] += 1
          else:
            vocab[word] = 1
      vocab_list = _START_VOCAB + sorted(vocab, key=vocab.get, reverse=True)
      if len(vocab_list) > max_vocabulary_size:
        vocab_list = vocab_list[:max_vocabulary_size]
      with gfile.GFile(vocabulary_path, mode="wb") as vocab_file:
        for w in vocab_list:
          vocab_file.write(w + b"\n")
Project: tf-translate    Author: chrislit    | project source | file source
def initialize_vocabulary(vocabulary_path):
  """Initialize vocabulary from file.

  We assume the vocabulary is stored one-item-per-line, so a file:
    dog
    cat
  will result in a vocabulary {"dog": 0, "cat": 1}, and this function will
  also return the reversed-vocabulary ["dog", "cat"].

  Args:
    vocabulary_path: path to the file containing the vocabulary.

  Returns:
    a pair: the vocabulary (a dictionary mapping string to integers), and
    the reversed vocabulary (a list, which reverses the vocabulary mapping).

  Raises:
    ValueError: if the provided vocabulary_path does not exist.
  """
  if gfile.Exists(vocabulary_path):
    rev_vocab = []
    with gfile.GFile(vocabulary_path, mode="rb") as f:
      rev_vocab.extend(f.readlines())
    rev_vocab = [line.strip() for line in rev_vocab]
    vocab = dict([(x, y) for (y, x) in enumerate(rev_vocab)])
    return vocab, rev_vocab
  else:
    raise ValueError("Vocabulary file %s not found." % vocabulary_path)
Project: tf-translate    Author: chrislit    | project source | file source
def data_to_token_ids(data_path, target_path, vocabulary_path,
                      tokenizer=None, normalize_digits=True):
  """Tokenize data file and turn into token-ids using given vocabulary file.

  This function loads data line-by-line from data_path, calls the above
  sentence_to_token_ids, and saves the result to target_path. See comment
  for sentence_to_token_ids on the details of token-ids format.

  Args:
    data_path: path to the data file in one-sentence-per-line format.
    target_path: path where the file with token-ids will be created.
    vocabulary_path: path to the vocabulary file.
    tokenizer: a function to use to tokenize each sentence;
      if None, basic_word_tokenizer will be used.
    normalize_digits: Boolean; if true, all digits are replaced by 0s.
  """
  if not gfile.Exists(target_path):
    print("Tokenizing data in %s" % data_path)
    vocab, _ = initialize_vocabulary(vocabulary_path)
    with gfile.GFile(data_path, mode="rb") as data_file:
      with gfile.GFile(target_path, mode="w") as tokens_file:
        counter = 0
        for line in data_file:
          counter += 1
          if counter % 100000 == 0:
            print("  tokenizing line %d" % counter)
          token_ids = sentence_to_token_ids(line, vocab, tokenizer,
                                            normalize_digits)
          tokens_file.write(" ".join([str(tok) for tok in token_ids]) + "\n")
Project: tf-seq2seq-mod    Author: knok    | project source | file source
def get_wmt_enfr_train_set(directory):
  """Download the WMT en-fr training corpus to directory unless it's there."""
  train_path = os.path.join(directory, "giga-fren.release2.fixed")
  if not (gfile.Exists(train_path +".fr") and gfile.Exists(train_path +".en")):
    corpus_file = maybe_download(directory, "training-giga-fren.tar",
                                 _WMT_ENFR_TRAIN_URL)
    print("Extracting tar file %s" % corpus_file)
    with tarfile.open(corpus_file, "r") as corpus_tar:
      corpus_tar.extractall(directory)
    gunzip_file(train_path + ".fr.gz", train_path + ".fr")
    gunzip_file(train_path + ".en.gz", train_path + ".en")
  return train_path
Project: tf-seq2seq-mod    Author: knok    | project source | file source
def get_wmt_enfr_dev_set(directory):
  """Download the WMT en-fr training corpus to directory unless it's there."""
  dev_name = "newstest2013"
  dev_path = os.path.join(directory, dev_name)
  if not (gfile.Exists(dev_path + ".fr") and gfile.Exists(dev_path + ".en")):
    dev_file = maybe_download(directory, "dev-v2.tgz", _WMT_ENFR_DEV_URL)
    print("Extracting tgz file %s" % dev_file)
    with tarfile.open(dev_file, "r:gz") as dev_tar:
      fr_dev_file = dev_tar.getmember("dev/" + dev_name + ".fr")
      en_dev_file = dev_tar.getmember("dev/" + dev_name + ".en")
      fr_dev_file.name = dev_name + ".fr"  # Extract without "dev/" prefix.
      en_dev_file.name = dev_name + ".en"
      dev_tar.extract(fr_dev_file, directory)
      dev_tar.extract(en_dev_file, directory)
  return dev_path
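The two WMT helpers above share the same guard: run the download and extraction only if at least one expected output is missing. A hedged generalization of that check (the helper name and paths below are hypothetical):

def all_outputs_exist(base_path, suffixes):
    # True only if base_path + suffix exists for every suffix.
    return all(gfile.Exists(base_path + suffix) for suffix in suffixes)

# Hypothetical use: skip extraction when both language files are already present.
if not all_outputs_exist("/tmp/wmt/newstest2013", [".fr", ".en"]):
    print("At least one corpus file is missing; extraction would run here.")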
Project: tf-seq2seq-mod    Author: knok    | project source | file source
def create_vocabulary(vocabulary_path, data_path, max_vocabulary_size,
                      tokenizer=None, normalize_digits=True):
  """Create vocabulary file (if it does not exist yet) from data file.

  Data file is assumed to contain one sentence per line. Each sentence is
  tokenized and digits are normalized (if normalize_digits is set).
  Vocabulary contains the most-frequent tokens up to max_vocabulary_size.
  We write it to vocabulary_path in a one-token-per-line format, so that later
  token in the first line gets id=0, second line gets id=1, and so on.

  Args:
    vocabulary_path: path where the vocabulary will be created.
    data_path: data file that will be used to create vocabulary.
    max_vocabulary_size: limit on the size of the created vocabulary.
    tokenizer: a function to use to tokenize each data sentence;
      if None, basic_tokenizer will be used.
    normalize_digits: Boolean; if true, all digits are replaced by 0s.
  """
  if not gfile.Exists(vocabulary_path):
    print("Creating vocabulary %s from data %s" % (vocabulary_path, data_path))
    vocab = {}
    with gfile.GFile(data_path, mode="rb") as f:
      counter = 0
      for line in f:
        counter += 1
        if counter % 100000 == 0:
          print("  processing line %d" % counter)
        line = tf.compat.as_bytes(line)
        tokens = tokenizer(line) if tokenizer else basic_tokenizer(line)
        for w in tokens:
          word = _DIGIT_RE.sub(b"0", w) if normalize_digits else w
          if word in vocab:
            vocab[word] += 1
          else:
            vocab[word] = 1
      vocab_list = _START_VOCAB + sorted(vocab, key=vocab.get, reverse=True)
      if len(vocab_list) > max_vocabulary_size:
        vocab_list = vocab_list[:max_vocabulary_size]
      with gfile.GFile(vocabulary_path, mode="wb") as vocab_file:
        for w in vocab_list:
          vocab_file.write(w + b"\n")