Python tensorflow module: string_split() usage examples

The following 33 code examples, extracted from open-source Python projects, show how to use tensorflow.string_split().
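
Before diving into the project examples, here is a minimal sketch (TensorFlow 1.x API; the sentences are made up for illustration) of what tf.string_split() returns: a tf.SparseTensor with one row per input string, whose values hold the tokens. The default delimiter is a single space.

import tensorflow as tf

sentences = tf.constant(['hello world', 'goodbye'])
tokens = tf.string_split(sentences)  # SparseTensor; default delimiter is ' '

with tf.Session() as sess:
    result = sess.run(tokens)
    print(result.indices)      # [[0 0] [0 1] [1 0]]
    print(result.values)       # ['hello' 'world' 'goodbye']
    print(result.dense_shape)  # [2 2]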

Project: transform    Author: tensorflow
def testNGramsWithSpaceSeparator(self):
    string_tensor = tf.constant(['One was Johnny', 'Two was a rat'])
    tokenized_tensor = tf.string_split(string_tensor, delimiter=' ')
    output_tensor = mappers.ngrams(
        tokens=tokenized_tensor,
        ngram_range=(1, 2),
        separator=' ')
    with tf.Session():
      output = output_tensor.eval()
      self.assertAllEqual(
          output.indices,
          [[0, 0], [0, 1], [0, 2], [0, 3], [0, 4],
           [1, 0], [1, 1], [1, 2], [1, 3], [1, 4], [1, 5], [1, 6]])
      self.assertAllEqual(output.values, [
          'One', 'One was', 'was', 'was Johnny', 'Johnny',
          'Two', 'Two was', 'was', 'was a', 'a', 'a rat', 'rat'])
      self.assertAllEqual(output.dense_shape, [2, 7])
Project: ChessAI    Author: SamRagusa
def full_onehot_process_line_as_2d_input(the_str, num_samples=-1):
    with tf.name_scope("process_data_2d"):
        #with tf.device("/cpu:0"):

        # A tensor referenced when getting indices of characters for the the_values array
        mapping_strings = tf.constant(
            ["0", "1", "K", "Q", "R", "B", "N", "P", "C", "k", "q", "r", "b", "n", "p", "c"])

        number_of_mapping_strings = 16  # len(mapping_strings)
        the_values = tf.constant(
            [[1 if i == j else 0 for i in range(number_of_mapping_strings)] for j in range(number_of_mapping_strings)],
            dtype=tf.float32)

        # Create the table for getting indices (for the_values) from the information about the board
        the_table = tf.contrib.lookup.index_table_from_tensor(mapping=mapping_strings, name="index_lookup_table")

        data = tf.reshape(
            # Get the values at the given indices
            tf.gather(
                the_values,
                # Get an array of indices corresponding to the array of characters
                the_table.lookup(
                    # Split the string into an array of characters
                    tf.string_split(
                        [the_str],
                        delimiter="").values)),
            [num_samples, 64, number_of_mapping_strings]) #THIS SHOULD REALLY BE [3x8x8,num_mapping_strings]

        return data
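
A hedged usage sketch for the function above: the 64-character board string is invented for illustration, and the contrib lookup table must be initialized with tf.tables_initializer() before anything can be evaluated.

board_str = tf.constant("P" * 64)  # hypothetical all-pawn board, one char per square
board_tensor = full_onehot_process_line_as_2d_input(board_str)

with tf.Session() as sess:
    sess.run(tf.tables_initializer())
    print(sess.run(board_tensor).shape)  # (1, 64, 16), since num_samples=-1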
Project: seq2seq    Author: google
def decode(self, data, items):
    decoded_items = {}

    # Split tokens
    tokens = tf.string_split([data], delimiter=self.delimiter).values

    # Optionally prepend a special token
    if self.prepend_token is not None:
      tokens = tf.concat([[self.prepend_token], tokens], 0)

    # Optionally append a special token
    if self.append_token is not None:
      tokens = tf.concat([tokens, [self.append_token]], 0)

    decoded_items[self.length_feature_name] = tf.size(tokens)
    decoded_items[self.tokens_feature_name] = tokens
    return [decoded_items[_] for _ in items]
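
The same split/prepend/append pattern as a standalone sketch; SEQUENCE_START and SEQUENCE_END are assumed stand-ins for whatever special tokens the decoder is configured with.

data = tf.constant("the quick brown fox")
tokens = tf.string_split([data], delimiter=" ").values
tokens = tf.concat([["SEQUENCE_START"], tokens, ["SEQUENCE_END"]], 0)
length = tf.size(tokens)

with tf.Session() as sess:
    print(sess.run(tokens))  # [SEQUENCE_START, the, quick, brown, fox, SEQUENCE_END]
    print(sess.run(length))  # 6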
Project: tf-crnn    Author: solivr
def image_reading(path: str, resized_size: Tuple[int, int]=None, data_augmentation: bool=False,
                  padding: bool=False) -> Tuple[tf.Tensor, tf.Tensor]:
    # Read image
    image_content = tf.read_file(path, name='image_reader')
    image = tf.cond(tf.equal(tf.string_split([path], '.').values[1], tf.constant('jpg', dtype=tf.string)),
                    true_fn=lambda: tf.image.decode_jpeg(image_content, channels=1, try_recover_truncated=True), # TODO channels = 3 ?
                    false_fn=lambda: tf.image.decode_png(image_content, channels=1), name='image_decoding')

    # Data augmentation
    if data_augmentation:
        image = augment_data(image)

    # Padding
    if padding:
        with tf.name_scope('padding'):
            image, img_width = padding_inputs_width(image, resized_size, increment=CONST.DIMENSION_REDUCTION_W_POOLING)
    # Resize
    else:
        image = tf.image.resize_images(image, size=resized_size)
        img_width = tf.shape(image)[1]

    with tf.control_dependencies([tf.assert_equal(image.shape[:2], resized_size)]):
        return image, img_width
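
Note that values[1] in the extension check above assumes the path contains exactly one dot. A hedged variant (reusing the same path argument) takes the last element of the split instead, so paths such as ./data/img.01.jpg still resolve to their extension:

extension = tf.string_split([path], '.').values[-1]
is_jpg = tf.equal(extension, tf.constant('jpg', dtype=tf.string))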
Project: conv_seq2seq    Author: tobyyouup
def decode(self, data, items):
    decoded_items = {}

    # Split tokens
    tokens = tf.string_split([data], delimiter=self.delimiter).values

    # Optionally prepend a special token
    if self.prepend_token is not None:
      tokens = tf.concat([[self.prepend_token], tokens], 0)

    # Optionally append a special token
    if self.append_token is not None:
      tokens = tf.concat([tokens, [self.append_token]], 0)

    decoded_items[self.length_feature_name] = tf.size(tokens)
    decoded_items[self.tokens_feature_name] = tokens
    return [decoded_items[_] for _ in items]
Project: transform    Author: tensorflow
def testTFIDFNoData(self):
    def preprocessing_fn(inputs):
      inputs_as_ints = tft.string_to_int(tf.string_split(inputs['a']))
      out_index, out_values = tft.tfidf(inputs_as_ints, 6)
      return {
          'tf_idf': out_values,
          'index': out_index
      }
    input_data = [{'a': ''}]
    input_schema = dataset_metadata.DatasetMetadata({
        'a': sch.ColumnSchema(tf.string, [], sch.FixedColumnRepresentation())
    })
    expected_transformed_data = [{'tf_idf': [], 'index': []}]
    expected_transformed_schema = dataset_metadata.DatasetMetadata({
        'tf_idf': sch.ColumnSchema(tf.float32, [None],
                                   sch.ListColumnRepresentation()),
        'index': sch.ColumnSchema(tf.int64, [None],
                                  sch.ListColumnRepresentation())
    })
    self.assertAnalyzeAndTransformResults(
        input_data, input_schema, preprocessing_fn, expected_transformed_data,
        expected_transformed_schema)
Project: transform    Author: tensorflow
def testUniquesAnalyzerWithTokenization(self):
    def preprocessing_fn(inputs):
      return {
          'index': tft.string_to_int(tf.string_split(inputs['a']))
      }

    input_data = [{'a': 'hello hello world'}, {'a': 'hello goodbye world'}]
    input_metadata = dataset_metadata.DatasetMetadata({
        'a': sch.ColumnSchema(tf.string, [], sch.FixedColumnRepresentation())
    })
    expected_data = [{'index': [0, 0, 1]}, {'index': [0, 2, 1]}]
    expected_metadata = dataset_metadata.DatasetMetadata({
        'index': sch.ColumnSchema(
            sch.IntDomain(tf.int64, -1, 2, True,
                          'vocab_string_to_int_uniques'),
            [None], sch.ListColumnRepresentation())
    })
    self.assertAnalyzeAndTransformResults(
        input_data, input_metadata, preprocessing_fn, expected_data,
        expected_metadata)
Project: automatic-summarization    Author: mozilla
def decode(self, data, items):
    decoded_items = {}

    # Split tokens
    tokens = tf.string_split([data], delimiter=self.delimiter).values

    # Optionally prepend a special token
    if self.prepend_token is not None:
      tokens = tf.concat([[self.prepend_token], tokens], 0)

    # Optionally append a special token
    if self.append_token is not None:
      tokens = tf.concat([tokens, [self.append_token]], 0)

    decoded_items[self.length_feature_name] = tf.size(tokens)
    decoded_items[self.tokens_feature_name] = tokens
    return [decoded_items[_] for _ in items]
Project: polyaxon    Author: polyaxon
def decode(self, data, items):
        decoded_items = {}

        # Split tokens
        tokens = tf.string_split([data], delimiter=self.delimiter).values

        # Optionally prepend a special token
        if self.prepend_token is not None:
            tokens = tf.concat([[self.prepend_token], tokens], 0)

        # Optionally append a special token
        if self.append_token is not None:
            tokens = tf.concat([tokens, [self.append_token]], 0)

        decoded_items[self.length_feature_name] = tf.size(tokens)
        decoded_items[self.tokens_feature_name] = tokens
        return [decoded_items[_] for _ in items]
Project: cloudml-samples    Author: GoogleCloudPlatform
def make_preprocessing_fn(frequency_threshold):
  """Creates a preprocessing function for reddit.

  Args:
    frequency_threshold: The frequency_threshold used when generating
      vocabularies for categorical and text features.

  Returns:
    A preprocessing function.
  """

  def preprocessing_fn(inputs):
    """User defined preprocessing function for reddit columns.

    Args:
      inputs: dictionary of input `tensorflow_transform.Column`.
    Returns:
      A dictionary of `tensorflow_transform.Column` representing the transformed
          columns.
    """
    # TODO(b/35001605) Make this "passthrough" more DRY.
    result = {'score': inputs['score'], 'toplevel': inputs['toplevel']}

    result['subreddit_id'] = tft.string_to_int(
        inputs['subreddit'], frequency_threshold=frequency_threshold)

    for name in ('author', 'comment_body', 'comment_parent_body'):
      words = tf.string_split(inputs[name])
      # TODO(b/33467613) Translate these to bag-of-words style sparse features.
      result[name + '_bow'] = tft.string_to_int(
          words, frequency_threshold=frequency_threshold)

    return result

  return preprocessing_fn
Project: HyperGAN    Author: 255BITS
def __init__(self, config, batch_size, one_hot=False):
        self.lookup = None
        reader = tf.TextLineReader()
        filename_queue = tf.train.string_input_producer(["chargan.txt"])
        key, x = reader.read(filename_queue)
        vocabulary = self.get_vocabulary()

        table = tf.contrib.lookup.string_to_index_table_from_tensor(
            mapping = vocabulary, default_value = 0)

        x = tf.string_join([x, tf.constant(" " * 64)]) 
        x = tf.substr(x, [0], [64])
        x = tf.string_split(x,delimiter='')
        x = tf.sparse_tensor_to_dense(x, default_value=' ')
        x = tf.reshape(x, [64])
        x = table.lookup(x)
        self.one_hot = one_hot
        if one_hot:
            x = tf.one_hot(x, len(vocabulary))
            x = tf.cast(x, dtype=tf.float32)
            x = tf.reshape(x, [1, int(x.get_shape()[0]), int(x.get_shape()[1]), 1])
        else:
            x = tf.cast(x, dtype=tf.float32)
            x -= len(vocabulary)/2.0
            x /= len(vocabulary)/2.0
            x = tf.reshape(x, [1,1, 64, 1])

        num_preprocess_threads = 8

        x = tf.train.shuffle_batch(
          [x],
          batch_size=batch_size,
          num_threads=num_preprocess_threads,
          capacity= 5000,
          min_after_dequeue=500,
          enqueue_many=True)

        self.x = x
        self.table = table
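
A minimal sketch of the character-level step in the constructor above, with a toy vocabulary chosen for illustration: split a fixed-width string into single characters, densify the SparseTensor, then map each character to an id through a lookup table.

vocab = ["<unk>", " ", "a", "b", "c"]
table = tf.contrib.lookup.index_table_from_tensor(mapping=vocab, default_value=0)

line = tf.constant(["abc  "])                # already padded to width 5
chars = tf.string_split(line, delimiter="")  # one SparseTensor entry per character
chars = tf.sparse_tensor_to_dense(chars, default_value=" ")
ids = table.lookup(tf.reshape(chars, [5]))

with tf.Session() as sess:
    sess.run(tf.tables_initializer())
    print(sess.run(ids))                     # [2 3 4 1 1]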
Project: transform    Author: tensorflow
def testStringToTFIDFEmptyDoc(self):
    def preprocessing_fn(inputs):
      inputs_as_ints = tft.string_to_int(tf.string_split(inputs['a']))
      out_index, out_values = tft.tfidf(inputs_as_ints, 6)
      return {
          'tf_idf': out_values,
          'index': out_index
      }
    input_data = [{'a': 'hello hello world'},
                  {'a': ''},
                  {'a': 'hello goodbye hello world'},
                  {'a': 'I like pie pie pie'}]
    input_schema = dataset_metadata.DatasetMetadata({
        'a': sch.ColumnSchema(tf.string, [], sch.FixedColumnRepresentation())
    })

    log_5_over_2 = 1.91629073187
    log_5_over_3 = 1.51082562376
    expected_transformed_data = [{
        'tf_idf': [(2/3)*log_5_over_3, (1/3)*log_5_over_3],
        'index': [0, 2]
    }, {
        'tf_idf': [],
        'index': []
    }, {
        'tf_idf': [(2/4)*log_5_over_3, (1/4)*log_5_over_3, (1/4)*log_5_over_2],
        'index': [0, 2, 4]
    }, {
        'tf_idf': [(3/5)*log_5_over_2, (1/5)*log_5_over_2, (1/5)*log_5_over_2],
        'index': [1, 3, 5]
    }]
    expected_transformed_schema = dataset_metadata.DatasetMetadata({
        'tf_idf': sch.ColumnSchema(tf.float32, [None],
                                   sch.ListColumnRepresentation()),
        'index': sch.ColumnSchema(tf.int64, [None],
                                  sch.ListColumnRepresentation())
    })
    self.assertAnalyzeAndTransformResults(
        input_data, input_schema, preprocessing_fn, expected_transformed_data,
        expected_transformed_schema)
Project: transform    Author: tensorflow
def testUniquesAnalyzerWithHighFrequencyThresholdAndOOVBuckets(self):
    def preprocessing_fn(inputs):
      return {
          'index1':
              tft.string_to_int(
                  tf.string_split(inputs['a']),
                  default_value=-99,
                  top_k=1,
                  num_oov_buckets=3)
      }

    input_data = [
        {'a': 'hello hello world world'},
        {'a': 'hello tarkus toccata'},
        {'a': 'hello goodbye foo'}
    ]
    input_metadata = dataset_metadata.DatasetMetadata({
        'a': sch.ColumnSchema(tf.string, [], sch.FixedColumnRepresentation())
    })
    # Generated vocab (ordered by frequency, then value) should be:
    # ["hello", "world", "goodbye", "foo", "tarkus", "toccata"]. After applying
    # top_k =1 this becomes ["hello"] plus three OOV buckets.
    # The specific output values here depend on the hash of the words, and the
    # test will break if the hash changes.
    expected_data = [
        {'index1': [0, 0, 2, 2]},
        {'index1': [0, 3, 1]},
        {'index1': [0, 2, 1]},
    ]
    expected_metadata = dataset_metadata.DatasetMetadata({
        'index1': sch.ColumnSchema(
            sch.IntDomain(tf.int64, 0, 3, True,
                          'vocab_string_to_int_uniques'), [None],
            sch.ListColumnRepresentation()),
    })
    self.assertAnalyzeAndTransformResults(
        input_data, input_metadata, preprocessing_fn, expected_data,
        expected_metadata)
Project: transform    Author: tensorflow
def testNGramsEmpty(self):
    output_tensor = mappers.ngrams(tf.string_split(tf.constant([''])),
                                   (1, 5), '')
    with tf.Session():
      output = output_tensor.eval()
      self.assertEqual((0, 2), output.indices.shape)
      self.assertAllEqual([1, 0], output.dense_shape)
      self.assertEqual(0, len(output.values))
Project: transform    Author: tensorflow
def testNGrams(self):
    string_tensor = tf.constant(['abc', 'def', 'fghijklm', 'z', ''])
    tokenized_tensor = tf.string_split(string_tensor, delimiter='')
    output_tensor = mappers.ngrams(
        tokens=tokenized_tensor,
        ngram_range=(1, 5),
        separator='')
    self.assertSparseOutput(
        expected_indices=[
            [0, 0], [0, 1], [0, 2], [0, 3], [0, 4], [0, 5],
            [1, 0], [1, 1], [1, 2], [1, 3], [1, 4], [1, 5],
            [2, 0], [2, 1], [2, 2], [2, 3], [2, 4], [2, 5], [2, 6], [2, 7],
            [2, 8], [2, 9], [2, 10], [2, 11], [2, 12], [2, 13], [2, 14],
            [2, 15], [2, 16], [2, 17], [2, 18], [2, 19], [2, 20], [2, 21],
            [2, 22], [2, 23], [2, 24], [2, 25], [2, 26], [2, 27], [2, 28],
            [2, 29], [3, 0]],
        expected_values=[
            'a', 'ab', 'abc', 'b', 'bc', 'c',
            'd', 'de', 'def', 'e', 'ef', 'f',
            'f', 'fg', 'fgh', 'fghi', 'fghij', 'g', 'gh', 'ghi', 'ghij',
            'ghijk', 'h', 'hi', 'hij', 'hijk', 'hijkl', 'i', 'ij', 'ijk',
            'ijkl', 'ijklm', 'j', 'jk', 'jkl', 'jklm', 'k', 'kl', 'klm', 'l',
            'lm', 'm', 'z'],
        expected_shape=[5, 30],
        actual_sparse_tensor=output_tensor,
        close_values=False)
Project: transform    Author: tensorflow
def testNGramsBadSizes(self):
    string_tensor = tf.constant(['abc', 'def', 'fghijklm', 'z', ''])
    tokenized_tensor = tf.string_split(string_tensor, delimiter='')
    with self.assertRaisesRegexp(ValueError, 'Invalid ngram_range'):
      mappers.ngrams(tokenized_tensor, (0, 5), separator='')
    with self.assertRaisesRegexp(ValueError, 'Invalid ngram_range'):
      mappers.ngrams(tokenized_tensor, (6, 5), separator='')
Project: XMUNMT    Author: XMUNLP
def get_inference_input(inputs, params):
    dataset = tf.data.Dataset.from_tensor_slices(
        tf.constant(inputs)
    )

    # Split string
    dataset = dataset.map(lambda x: tf.string_split([x]).values,
                          num_parallel_calls=params.num_threads)

    # Append <eos>
    dataset = dataset.map(
        lambda x: tf.concat([x, [tf.constant(params.eos)]], axis=0),
        num_parallel_calls=params.num_threads
    )

    # Convert tuple to dictionary
    dataset = dataset.map(
        lambda x: {"source": x, "source_length": tf.shape(x)[0]},
        num_parallel_calls=params.num_threads
    )

    dataset = dataset.padded_batch(
        params.decode_batch_size,
        {"source": [tf.Dimension(None)], "source_length": []},
        {"source": params.pad, "source_length": 0}
    )

    iterator = dataset.make_one_shot_iterator()
    features = iterator.get_next()

    src_table = tf.contrib.lookup.index_table_from_tensor(
        tf.constant(params.vocabulary["source"]),
        default_value=params.mapping["source"][params.unk]
    )
    features["source"] = src_table.lookup(features["source"])

    return features
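
A hedged usage sketch for get_inference_input: params is a placeholder namespace carrying the fields the function reads (num_threads, eos, pad, unk, vocabulary, mapping, decode_batch_size), and the vocabulary table needs tf.tables_initializer() before the first run.

features = get_inference_input(["a simple test .", "another sentence ."], params)

with tf.Session() as sess:
    sess.run(tf.tables_initializer())
    batch = sess.run(features)
    print(batch["source"].shape)   # (decode_batch_size, padded_length) of token ids
    print(batch["source_length"])  # per-sentence token counts, including <eos>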
Project: GNMT2    Author: Mingyearn
def get_infer_iterator(
    src_dataset, src_vocab_table, batch_size,
    source_reverse, eos, src_max_len=None):
  src_eos_id = tf.cast(src_vocab_table.lookup(tf.constant(eos)), tf.int32)
  src_dataset = src_dataset.map(lambda src: tf.string_split([src]).values)

  if src_max_len:
    src_dataset = src_dataset.map(lambda src: src[:src_max_len])
  # Convert the word strings to ids
  src_dataset = src_dataset.map(
      lambda src: tf.cast(src_vocab_table.lookup(src), tf.int32))
  if source_reverse:
    src_dataset = src_dataset.map(lambda src: tf.reverse(src, axis=[0]))
  # Add in the word counts.
  src_dataset = src_dataset.map(lambda src: (src, tf.size(src)))

  def batching_func(x):
    return x.padded_batch(
        batch_size,
        # The entry is the source line rows;
        # this has unknown-length vectors.  The last entry is
        # the source row size; this is a scalar.
        padded_shapes=(tf.TensorShape([None]),  # src
                       tf.TensorShape([])),     # src_len
        # Pad the source sequences with eos tokens.
        # (Though notice we don't generally need to do this since
        # later on we will be masking out calculations past the true sequence.)
        padding_values=(src_eos_id,  # src
                        0))          # src_len -- unused

  batched_dataset = batching_func(src_dataset)
  batched_iter = batched_dataset.make_initializable_iterator()
  (src_ids, src_seq_len) = batched_iter.get_next()
  return BatchedInput(
      initializer=batched_iter.initializer,
      source=src_ids,
      target_input=None,
      target_output=None,
      source_sequence_length=src_seq_len,
      target_sequence_length=None)
Project: attention    Author: louishenrifranc
def get_input_fn(batch_size, num_epochs, context_filename, answer_filename, max_sequence_len):
    def input_fn():
        source_dataset = tf.contrib.data.TextLineDataset(context_filename)
        target_dataset = tf.contrib.data.TextLineDataset(answer_filename)

        def map_dataset(dataset):
            dataset = dataset.map(lambda string: tf.string_split([string]).values)
            dataset = dataset.map(lambda token: tf.string_to_number(token, tf.int64))
            dataset = dataset.map(lambda tokens: (tokens, tf.size(tokens)))
            dataset = dataset.map(lambda tokens, size: (tokens[:max_sequence_len], tf.minimum(size, max_sequence_len)))
            return dataset

        source_dataset = map_dataset(source_dataset)
        target_dataset = map_dataset(target_dataset)

        dataset = tf.contrib.data.Dataset.zip((source_dataset, target_dataset))
        dataset = dataset.repeat(num_epochs)
        dataset = dataset.padded_batch(batch_size,
                                       padded_shapes=((tf.TensorShape([max_sequence_len]), tf.TensorShape([])),
                                                      (tf.TensorShape([max_sequence_len]), tf.TensorShape([]))
                                                      ))

        iterator = dataset.make_one_shot_iterator()
        next_element = iterator.get_next()
        return next_element, None

    return input_fn
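
A hedged usage sketch: the file names below are placeholders for whitespace-separated token-id files, and the returned closure follows the usual input_fn contract of returning (features, labels).

input_fn = get_input_fn(batch_size=32, num_epochs=1,
                        context_filename="context.ids",  # placeholder path
                        answer_filename="answer.ids",    # placeholder path
                        max_sequence_len=50)
features, labels = input_fn()  # labels is None here
(src_tokens, src_len), (tgt_tokens, tgt_len) = features

with tf.Session() as sess:
    print(sess.run([src_len, tgt_len]))  # per-example lengths, capped at 50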
Project: THUMT    Author: thumt
def get_inference_input(inputs, params):
    dataset = tf.data.Dataset.from_tensor_slices(
        tf.constant(inputs)
    )

    # Split string
    dataset = dataset.map(lambda x: tf.string_split([x]).values,
                          num_parallel_calls=params.num_threads)

    # Append <eos>
    dataset = dataset.map(
        lambda x: tf.concat([x, [tf.constant(params.eos)]], axis=0),
        num_parallel_calls=params.num_threads
    )

    # Convert tuple to dictionary
    dataset = dataset.map(
        lambda x: {"source": x, "source_length": tf.shape(x)[0]},
        num_parallel_calls=params.num_threads
    )

    dataset = dataset.padded_batch(
        params.decode_batch_size,
        {"source": [tf.Dimension(None)], "source_length": []},
        {"source": params.pad, "source_length": 0}
    )

    iterator = dataset.make_one_shot_iterator()
    features = iterator.get_next()

    src_table = tf.contrib.lookup.index_table_from_tensor(
        tf.constant(params.vocabulary["source"]),
        default_value=params.mapping["source"][params.unk]
    )
    features["source"] = src_table.lookup(features["source"])

    return features
Project: nmt_v2    Author: rpryzant
def get_test_iterator(src_dataset, src_vocab_table, batch_size, config):
    src_eos_id = tf.cast(src_vocab_table.lookup(tf.constant(config.eos)), tf.int32)
    src_dataset = src_dataset.map(lambda src: tf.string_split([src]).values)

    src_dataset = src_dataset.map(lambda src: src[:config.src_max_len])

    src_dataset = src_dataset.map(
        lambda src: tf.cast(src_vocab_table.lookup(src), tf.int32))

    if config.reverse_src:
        src_dataset = src_dataset.map(lambda src: tf.reverse(src, axis=[0]))

    src_dataset = src_dataset.map(lambda src: (src, tf.size(src)))

    def batching_func(x):
        return x.padded_batch(
            config.batch_size,
            padded_shapes=(tf.TensorShape([None]),
                           tf.TensorShape([])),
            padding_values=(src_eos_id,
                            0))

    batched_dataset = batching_func(src_dataset)
    batched_iter = batched_dataset.make_initializable_iterator()
    src_ids, src_seq_len = batched_iter.get_next()
    return BatchedInput(
        initializer=batched_iter.initializer,
        source=src_ids,
        target_input=None,
        target_output=None,
        source_sequence_length=src_seq_len,
        target_sequence_length=None)
Project: windbag    Author: tongda
def _read_id_file(path) -> Dataset:
  def _parse_line(line):
    splits = tf.string_split(tf.reshape(line, (-1,))).values
    return tf.string_to_number(splits, out_type=tf.int32)

  return TextLineDataset(path) \
    .filter(lambda line: tf.size(line) > 0) \
    .map(_parse_line)
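
One caveat worth flagging: tf.size of a scalar string is the number of tensor elements (always 1), not the string length, so the filter above never actually drops a line. A hedged variant that filters on token count instead:

def _read_id_file_nonempty(path) -> Dataset:
  # Same parsing as _read_id_file, but the filter counts tokens rather than
  # tensor elements, so blank lines really are dropped.
  def _parse_line(line):
    splits = tf.string_split(tf.reshape(line, (-1,))).values
    return tf.string_to_number(splits, out_type=tf.int32)

  return TextLineDataset(path) \
    .filter(lambda line: tf.size(tf.string_split([line]).values) > 0) \
    .map(_parse_line)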
Project: transform    Author: tensorflow
def testStringToTFIDF(self):
    def preprocessing_fn(inputs):
      inputs_as_ints = tft.string_to_int(tf.string_split(inputs['a']))
      out_index, out_values = tft.tfidf(inputs_as_ints, 6)
      return {
          'tf_idf': out_values,
          'index': out_index
      }
    input_data = [{'a': 'hello hello world'},
                  {'a': 'hello goodbye hello world'},
                  {'a': 'I like pie pie pie'}]
    input_schema = dataset_metadata.DatasetMetadata({
        'a': sch.ColumnSchema(tf.string, [], sch.FixedColumnRepresentation())
    })

    # IDFs
    # hello = log(4/3) = 0.28768
    # world = log(4/3)
    # goodbye = log(4/2) = 0.69314
    # I = log(4/2)
    # like = log(4/2)
    # pie = log(4/2)
    log_4_over_2 = 1.69314718056
    log_4_over_3 = 1.28768207245
    expected_transformed_data = [{
        'tf_idf': [(2/3)*log_4_over_3, (1/3)*log_4_over_3],
        'index': [0, 2]
    }, {
        'tf_idf': [(2/4)*log_4_over_3, (1/4)*log_4_over_3, (1/4)*log_4_over_2],
        'index': [0, 2, 4]
    }, {
        'tf_idf': [(3/5)*log_4_over_2, (1/5)*log_4_over_2, (1/5)*log_4_over_2],
        'index': [1, 3, 5]
    }]
    expected_transformed_schema = dataset_metadata.DatasetMetadata({
        'tf_idf': sch.ColumnSchema(tf.float32, [None],
                                   sch.ListColumnRepresentation()),
        'index': sch.ColumnSchema(tf.int64, [None],
                                  sch.ListColumnRepresentation())
    })
    self.assertAnalyzeAndTransformResults(
        input_data, input_schema, preprocessing_fn, expected_transformed_data,
        expected_transformed_schema)
Project: transform    Author: tensorflow
def testTFIDFWithOOV(self):
    test_vocab_size = 3
    def preprocessing_fn(inputs):
      inputs_as_ints = tft.string_to_int(tf.string_split(inputs['a']),
                                         top_k=test_vocab_size)
      out_index, out_values = tft.tfidf(inputs_as_ints,
                                        test_vocab_size+1)
      return {
          'tf_idf': out_values,
          'index': out_index
      }
    input_data = [{'a': 'hello hello world'},
                  {'a': 'hello goodbye hello world'},
                  {'a': 'I like pie pie pie'}]
    input_schema = dataset_metadata.DatasetMetadata({
        'a': sch.ColumnSchema(tf.string, [], sch.FixedColumnRepresentation())
    })

    # IDFs
    # hello = log(3/3) = 0
    # pie = log(3/2) = 0.4054651081
    # world = log(3/3) = 0
    # OOV - goodbye, I, like = log(3/3)
    log_4_over_2 = 1.69314718056
    log_4_over_3 = 1.28768207245
    expected_transformed_data = [{
        'tf_idf': [(2/3)*log_4_over_3, (1/3)*log_4_over_3],
        'index': [0, 2]
    }, {
        'tf_idf': [(2/4)*log_4_over_3, (1/4)*log_4_over_3, (1/4)*log_4_over_3],
        'index': [0, 2, 3]
    }, {
        'tf_idf': [(3/5)*log_4_over_2, (2/5)*log_4_over_3],
        'index': [1, 3]
    }]
    expected_transformed_schema = dataset_metadata.DatasetMetadata({
        'tf_idf': sch.ColumnSchema(tf.float32, [None],
                                   sch.ListColumnRepresentation()),
        'index': sch.ColumnSchema(tf.int64, [None],
                                  sch.ListColumnRepresentation())
    })
    self.assertAnalyzeAndTransformResults(
        input_data, input_schema, preprocessing_fn, expected_transformed_data,
        expected_transformed_schema)
Project: transform    Author: tensorflow
def testUniquesAnalyzerWithFrequencyThreshold(self):
    def preprocessing_fn(inputs):
      return {
          'index1': tft.string_to_int(tf.string_split(inputs['a']),
                                      default_value=-99, frequency_threshold=2),

          # As above but using a string for frequency_threshold (and changing
          # the default_value to showcase things).
          'index2': tft.string_to_int(tf.string_split(inputs['a']),
                                      default_value=-9, frequency_threshold='2')
      }

    input_data = [
        {'a': 'hello hello world'},
        {'a': 'hello goodbye world'},
        {'a': 'hello goodbye foo'}
    ]
    input_metadata = dataset_metadata.DatasetMetadata({
        'a': sch.ColumnSchema(tf.string, [], sch.FixedColumnRepresentation())
    })
    # Generated vocab (ordered by frequency, then value) should be:
    # ["hello", "world", "goodbye", "foo"]. After applying frequency_threshold=2
    # this becomes
    # ["hello", "world", "goodbye"].
    expected_data = [
        {'index1': [0, 0, 1], 'index2': [0, 0, 1]},
        {'index1': [0, 2, 1], 'index2': [0, 2, 1]},
        {'index1': [0, 2, -99], 'index2': [0, 2, -9]}
    ]
    expected_metadata = dataset_metadata.DatasetMetadata({
        'index1': sch.ColumnSchema(
            sch.IntDomain(tf.int64, -99, 2, True,
                          'vocab_string_to_int_uniques'),
            [None], sch.ListColumnRepresentation()),
        'index2': sch.ColumnSchema(
            sch.IntDomain(tf.int64, -9, 2, True,
                          'vocab_string_to_int_1_uniques'),
            [None], sch.ListColumnRepresentation())
    })
    self.assertAnalyzeAndTransformResults(
        input_data, input_metadata, preprocessing_fn, expected_data,
        expected_metadata)
Project: transform    Author: tensorflow
def testUniquesAnalyzerWithFrequencyThresholdTooHigh(self):
    # Expected to return an empty dict due to too high threshold.
    def preprocessing_fn(inputs):
      return {
          'index1':
              tft.string_to_int(
                  tf.string_split(inputs['a']),
                  default_value=-99,
                  frequency_threshold=77),

          # As above but using a string for frequency_threshold (and changing
          # the default_value to showcase things).
          'index2':
              tft.string_to_int(
                  tf.string_split(inputs['a']),
                  default_value=-9,
                  frequency_threshold='77')
      }

    input_data = [
        {'a': 'hello hello world'},
        {'a': 'hello goodbye world'},
        {'a': 'hello goodbye foo'}
    ]
    input_metadata = dataset_metadata.DatasetMetadata({
        'a': sch.ColumnSchema(tf.string, [], sch.FixedColumnRepresentation())
    })
    # Generated vocab (ordered by frequency, then value) should be:
    # ["hello", "world", "goodbye", "foo"]. After applying frequency_threshold=2
    # this becomes empty.
    expected_data = [
        {'index1': [-99, -99, -99], 'index2': [-9, -9, -9]},
        {'index1': [-99, -99, -99], 'index2': [-9, -9, -9]},
        {'index1': [-99, -99, -99], 'index2': [-9, -9, -9]}
    ]
    # Note the vocabs are empty but the tables have size 1 so max_value is 1.
    expected_metadata = dataset_metadata.DatasetMetadata({
        'index1': sch.ColumnSchema(
            sch.IntDomain(tf.int64, -99, 0, True,
                          'vocab_string_to_int_uniques'),
            [None], sch.ListColumnRepresentation()),
        'index2': sch.ColumnSchema(
            sch.IntDomain(tf.int64, -9, 0, True,
                          'vocab_string_to_int_1_uniques'),
            [None], sch.ListColumnRepresentation())
    })
    self.assertAnalyzeAndTransformResults(
        input_data, input_metadata, preprocessing_fn, expected_data,
        expected_metadata)
Project: XMUNMT    Author: XMUNLP
def get_evaluation_input(inputs, params):
    with tf.device("/cpu:0"):
        # Create datasets
        datasets = []

        for data in inputs:
            dataset = tf.data.Dataset.from_tensor_slices(data)
            # Split string
            dataset = dataset.map(lambda x: tf.string_split([x]).values,
                                  num_parallel_calls=params.num_threads)
            # Append <eos>
            dataset = dataset.map(
                lambda x: tf.concat([x, [tf.constant(params.eos)]], axis=0),
                num_parallel_calls=params.num_threads
            )
            datasets.append(dataset)

        dataset = tf.data.Dataset.zip(tuple(datasets))

        # Convert tuple to dictionary
        dataset = dataset.map(
            lambda *x: {
                "source": x[0],
                "source_length": tf.shape(x[0])[0],
                "references": x[1:]
            },
            num_parallel_calls=params.num_threads
        )

        dataset = dataset.padded_batch(
            params.eval_batch_size,
            {
                "source": [tf.Dimension(None)],
                "source_length": [],
                "references": (tf.Dimension(None),) * (len(inputs) - 1)
            },
            {
                "source": params.pad,
                "source_length": 0,
                "references": (params.pad,) * (len(inputs) - 1)
            }
        )

        iterator = dataset.make_one_shot_iterator()
        features = iterator.get_next()

        src_table = tf.contrib.lookup.index_table_from_tensor(
            tf.constant(params.vocabulary["source"]),
            default_value=params.mapping["source"][params.unk]
        )
        tgt_table = tf.contrib.lookup.index_table_from_tensor(
            tf.constant(params.vocabulary["target"]),
            default_value=params.mapping["target"][params.unk]
        )
        features["source"] = src_table.lookup(features["source"])
        features["references"] = tuple(
            tgt_table.lookup(item) for item in features["references"]
        )

    return features
Project: hsr    Author: pyk
def read_images(data_dir):
    pattern = os.path.join(data_dir, '*.png')
    filenames = tf.train.match_filenames_once(pattern, name='list_files')

    queue = tf.train.string_input_producer(
        filenames, 
        num_epochs=NUM_EPOCHS, 
        shuffle=True, 
        name='queue')

    reader = tf.WholeFileReader()
    filename, content = reader.read(queue, name='read_image')
    filename = tf.Print(
        filename, 
        data=[filename],
        message='loading: ')
    filename_split = tf.string_split([filename], delimiter='/')
    label_id = tf.string_to_number(tf.substr(filename_split.values[1], 
        0, 1), out_type=tf.int32)
    label = tf.one_hot(
        label_id-1, 
        5, 
        on_value=1.0, 
        off_value=0.0, 
        dtype=tf.float32)

    img_tensor = tf.image.decode_png(
        content, 
        dtype=tf.uint8, 
        channels=3,
        name='img_decode')

    # Preprocess the image, Performs random transformations
    # Random flip
    img_tensor_flip = tf.image.random_flip_left_right(img_tensor)

    # Random brightness
    img_tensor_bri = tf.image.random_brightness(img_tensor_flip, 
        max_delta=0.2)

    # Per-image scaling
    img_tensor_std = tf.image.per_image_standardization(img_tensor_bri)

    min_after_dequeue = 1000
    capacity = min_after_dequeue + 3 * BATCH_SIZE
    example_batch, label_batch = tf.train.shuffle_batch(
        [img_tensor_std, label], 
        batch_size=BATCH_SIZE,
        shapes=[(IMAGE_HEIGHT, IMAGE_WIDTH, NUM_CHANNELS), (NUM_CLASS,)],
        capacity=capacity, 
        min_after_dequeue=min_after_dequeue,
        name='train_shuffle')

    return example_batch, label_batch

# `images` is a 4-D tensor with the shape:
# [n_batch, img_height, img_width, n_channel]
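
A hedged variant of the label extraction in read_images above: filename_split.values[1] assumes a fixed directory depth, whereas taking the last component of the split keeps the first-character label independent of how deeply data_dir is nested.

parts = tf.string_split([filename], delimiter='/').values
basename = parts[-1]  # last path component
label_id = tf.string_to_number(tf.substr(basename, 0, 1), out_type=tf.int32)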
Project: nmt    Author: tensorflow
def get_infer_iterator(src_dataset,
                       src_vocab_table,
                       batch_size,
                       eos,
                       src_max_len=None):
  src_eos_id = tf.cast(src_vocab_table.lookup(tf.constant(eos)), tf.int32)
  src_dataset = src_dataset.map(lambda src: tf.string_split([src]).values)

  if src_max_len:
    src_dataset = src_dataset.map(lambda src: src[:src_max_len])
  # Convert the word strings to ids
  src_dataset = src_dataset.map(
      lambda src: tf.cast(src_vocab_table.lookup(src), tf.int32))
  # Add in the word counts.
  src_dataset = src_dataset.map(lambda src: (src, tf.size(src)))

  def batching_func(x):
    return x.padded_batch(
        batch_size,
        # The entry is the source line rows;
        # this has unknown-length vectors.  The last entry is
        # the source row size; this is a scalar.
        padded_shapes=(
            tf.TensorShape([None]),  # src
            tf.TensorShape([])),  # src_len
        # Pad the source sequences with eos tokens.
        # (Though notice we don't generally need to do this since
        # later on we will be masking out calculations past the true sequence.)
        padding_values=(
            src_eos_id,  # src
            0))  # src_len -- unused

  batched_dataset = batching_func(src_dataset)
  batched_iter = batched_dataset.make_initializable_iterator()
  (src_ids, src_seq_len) = batched_iter.get_next()
  return BatchedInput(
      initializer=batched_iter.initializer,
      source=src_ids,
      target_input=None,
      target_output=None,
      source_sequence_length=src_seq_len,
      target_sequence_length=None)
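
A hedged usage sketch for get_infer_iterator: the vocabulary and inference file names are placeholders, and both the lookup table and the initializable iterator need explicit initialization.

src_vocab_table = tf.contrib.lookup.index_table_from_file(
    "vocab.src", default_value=0)                     # placeholder vocab file
src_dataset = tf.data.TextLineDataset("infer.src")    # placeholder input file
batch = get_infer_iterator(src_dataset, src_vocab_table, batch_size=32, eos="</s>")

with tf.Session() as sess:
  sess.run(tf.tables_initializer())
  sess.run(batch.initializer)
  src_ids, src_len = sess.run([batch.source, batch.source_sequence_length])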
Project: ChessAI    Author: SamRagusa
def process_line_as_2d_input_with_ep(the_str):
    """
    NOTES:
    1) I likely won't be using this, opting to instead use the onehot implementation
    """
    with tf.name_scope("process_data_2d"):
        #     with tf.device("/cpu:0"):

        # A tensor referenced when getting indices of characters for the the_values array
        mapping_strings = tf.constant(["0", "1", "K", "Q", "R", "B", "N", "P", "C", "k", "q", "r", "b", "n", "p", "c"])

        the_values = tf.constant(
            [[0, 0, 0, 0, 0, 0, 0, 0],  # 0
             [0, 0, 0, 0, 0, 0, 1, 0],  # 1
             [1, 0, 0, 0, 0, 0, 0, 0],  # K
             [0, 1, 0, 0, 0, 0, 0, 0],  # Q
             [0, 0, 1, 0, 0, 0, 0, 0],  # R
             [0, 0, 0, 1, 0, 0, 0, 0],  # B
             [0, 0, 0, 0, 1, 0, 0, 0],  # N
             [0, 0, 0, 0, 0, 1, 0, 0],  # P
             [0, 0, 0, 0, 0, 0, 0, 1],  # C
             [-1, 0, 0, 0, 0, 0, 0, 0],  # k
             [0, -1, 0, 0, 0, 0, 0, 0],  # q
             [0, 0, -1, 0, 0, 0, 0, 0],  # r
             [0, 0, 0, -1, 0, 0, 0, 0],  # b
             [0, 0, 0, 0, -1, 0, 0, 0],  # n
             [0, 0, 0, 0, 0, -1, 0, 0],  # p
             [0, 0, 0, 0, 0, 0, 0, -1],  # c
             ], dtype=tf.float32)

        # Create the table for getting indices (for the_values) from the information about the board
        the_table = tf.contrib.lookup.index_table_from_tensor(mapping=mapping_strings, name="index_lookup_table")

        data = tf.reshape(
            # Get the values at the given indices
            tf.gather(
                the_values,
                # Get an array of indices corresponding to the array of characters
                the_table.lookup(
                    # Split the string into an array of characters
                    tf.string_split(
                        [the_str],
                        delimiter="").values)),
            [3, 64, 8])

        return data
Project: THUMT    Author: thumt
def get_evaluation_input(inputs, params):
    with tf.device("/cpu:0"):
        # Create datasets
        datasets = []

        for data in inputs:
            dataset = tf.data.Dataset.from_tensor_slices(data)
            # Split string
            dataset = dataset.map(lambda x: tf.string_split([x]).values,
                                  num_parallel_calls=params.num_threads)
            # Append <eos>
            dataset = dataset.map(
                lambda x: tf.concat([x, [tf.constant(params.eos)]], axis=0),
                num_parallel_calls=params.num_threads
            )
            datasets.append(dataset)

        dataset = tf.data.Dataset.zip(tuple(datasets))

        # Convert tuple to dictionary
        dataset = dataset.map(
            lambda *x: {
                "source": x[0],
                "source_length": tf.shape(x[0])[0],
                "references": x[1:]
            },
            num_parallel_calls=params.num_threads
        )

        dataset = dataset.padded_batch(
            params.eval_batch_size,
            {
                "source": [tf.Dimension(None)],
                "source_length": [],
                "references": (tf.Dimension(None),) * (len(inputs) - 1)
            },
            {
                "source": params.pad,
                "source_length": 0,
                "references": (params.pad,) * (len(inputs) - 1)
            }
        )

        iterator = dataset.make_one_shot_iterator()
        features = iterator.get_next()

        src_table = tf.contrib.lookup.index_table_from_tensor(
            tf.constant(params.vocabulary["source"]),
            default_value=params.mapping["source"][params.unk]
        )
        tgt_table = tf.contrib.lookup.index_table_from_tensor(
            tf.constant(params.vocabulary["target"]),
            default_value=params.mapping["target"][params.unk]
        )
        features["source"] = src_table.lookup(features["source"])
        features["references"] = tuple(
            tgt_table.lookup(item) for item in features["references"]
        )

    return features
Project: monodepth    Author: mrharicot
def __init__(self, data_path, filenames_file, params, dataset, mode):
        self.data_path = data_path
        self.params = params
        self.dataset = dataset
        self.mode = mode

        self.left_image_batch  = None
        self.right_image_batch = None

        input_queue = tf.train.string_input_producer([filenames_file], shuffle=False)
        line_reader = tf.TextLineReader()
        _, line = line_reader.read(input_queue)

        split_line = tf.string_split([line]).values

        # we load only one image for test, except if we trained a stereo model
        if mode == 'test' and not self.params.do_stereo:
            left_image_path  = tf.string_join([self.data_path, split_line[0]])
            left_image_o  = self.read_image(left_image_path)
        else:
            left_image_path  = tf.string_join([self.data_path, split_line[0]])
            right_image_path = tf.string_join([self.data_path, split_line[1]])
            left_image_o  = self.read_image(left_image_path)
            right_image_o = self.read_image(right_image_path)

        if mode == 'train':
            # randomly flip images
            do_flip = tf.random_uniform([], 0, 1)
            left_image  = tf.cond(do_flip > 0.5, lambda: tf.image.flip_left_right(right_image_o), lambda: left_image_o)
            right_image = tf.cond(do_flip > 0.5, lambda: tf.image.flip_left_right(left_image_o),  lambda: right_image_o)

            # randomly augment images
            do_augment  = tf.random_uniform([], 0, 1)
            left_image, right_image = tf.cond(do_augment > 0.5, lambda: self.augment_image_pair(left_image, right_image), lambda: (left_image, right_image))

            left_image.set_shape( [None, None, 3])
            right_image.set_shape([None, None, 3])

            # capacity = min_after_dequeue + (num_threads + a small safety margin) * batch_size
            min_after_dequeue = 2048
            capacity = min_after_dequeue + 4 * params.batch_size
            self.left_image_batch, self.right_image_batch = tf.train.shuffle_batch([left_image, right_image],
                        params.batch_size, capacity, min_after_dequeue, params.num_threads)

        elif mode == 'test':
            self.left_image_batch = tf.stack([left_image_o,  tf.image.flip_left_right(left_image_o)],  0)
            self.left_image_batch.set_shape( [2, None, None, 3])

            if self.params.do_stereo:
                self.right_image_batch = tf.stack([right_image_o,  tf.image.flip_left_right(right_image_o)],  0)
                self.right_image_batch.set_shape( [2, None, None, 3])
Project: kaggle-youtube-8m    Author: liufuyang
def make_preprocessing_fn(frequency_threshold):
  """Creates a preprocessing function for criteo.

  Args:
    frequency_threshold: The frequency_threshold used when generating
      vocabularies for categorical and text features.

  Returns:
    A preprocessing function.
  """

  def preprocessing_fn(inputs):
    """User defined preprocessing function for criteo columns.

    Args:
      inputs: dictionary of input `tensorflow_transform.Column`.
    Returns:
      A dictionary of `tensorflow_transform.Column` representing the transformed
          columns.
    """
    # TODO(b/35001605) Make this "passthrough" more DRY.
    result = {'score': inputs['score'], 'toplevel': inputs['toplevel']}

    result['subreddit_id'] = tft.string_to_int(
        inputs['subreddit'], frequency_threshold=frequency_threshold)

    # TODO(b/35318962): Obviate the need for this workaround on Dense features.
    # FeatureColumns expect shape (batch_size, 1), not just (batch_size)
    # All features added to results up to this point are dense and require this
    # workaround. All following features will be sparse.
    result = {
        k: tft.map(lambda x: tf.expand_dims(x, -1), v)
        for k, v in result.items()
    }

    for name in ('author', 'comment_body', 'comment_parent_body'):
      words = tft.map(tf.string_split, inputs[name])
      # TODO(b/33467613) Translate these to bag-of-words style sparse features.
      result[name + '_bow'] = tft.string_to_int(
          words, frequency_threshold=frequency_threshold)

    return result

  return preprocessing_fn