Usage examples for the Java class org.apache.hadoop.util.bloom.BloomFilter
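The snippets below are drawn from open-source projects that use this class; each entry names the project and the source file it comes from.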

Project: bloomfilter-course    File: MRBloomFilter.java
@Override
protected void setup(Context context) throws IOException,
        InterruptedException {

    // TODO Create a FileSystem object
    FileSystem fs = FileSystem.get(context.getConfiguration());

    // TODO get the cache files from the context
    URI[] uris = context.getCacheFiles();

    if (uris.length > 0) {
        // TODO create a new Bloom filter
        filter = new BloomFilter();

        // TODO call the filter's readFields method, passing in an FSDataInputStream
        filter.readFields(fs.open(new Path(uris[0].toString())));
    } else {
        throw new IOException(
                "Bloom filter file not in DistributedCache");
    }
}
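For context, setup() expects the serialized filter to arrive via the MapReduce distributed cache. A minimal driver-side sketch of how it would get there, with a hypothetical HDFS path (this is not part of the project source):

Configuration conf = new Configuration();
Job job = Job.getInstance(conf, "bloom-filter-join"); // hypothetical job name
// Ship the serialized filter so that setup() sees it in context.getCacheFiles()
job.addCacheFile(new URI("hdfs:///user/hadoop/bloom/filter.bin")); // hypothetical path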
Project: spork-streaming    File: BuildBloom.java
@Override
public Tuple exec(Tuple input) throws IOException {
    if (input == null || input.size() == 0) return null;

    // Strip off the initial level of bag
    DataBag values = (DataBag)input.get(0);
    Iterator<Tuple> it = values.iterator();
    Tuple t = it.next();

    // If the input tuple has only one field, then we'll extract
    // that field and serialize it into a key.  If it has multiple
    // fields, we'll serialize the whole tuple.
    byte[] b;
    if (t.size() == 1) b = DataType.toBytes(t.get(0));
    else b = DataType.toBytes(t, DataType.TUPLE);

    Key k = new Key(b);
    filter = new BloomFilter(vSize, numHash, hType);
    filter.add(k);

    return TupleFactory.getInstance().newTuple(bloomOut());
}
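Note that exec() builds a fresh one-element filter per call; in Pig's algebraic evaluation these partial filters are later OR'd together by bloomOr() (see the BuildBloomBase snippets further down), with bloomOut() serializing each partial result into the tuple that moves between stages.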
Project: spork    File: Bloom.java
private void init() throws IOException {
    filter = new BloomFilter();
    String dir = "./" + getFilenameFromPath(bloomFile);
    String[] partFiles = new File(dir)
            .list(new FilenameFilter() {
                @Override
                public boolean accept(File current, String name) {
                    return name.startsWith("part");
                }
            });

    String dcFile = dir + "/" + partFiles[0];
    DataInputStream dis = new DataInputStream(new FileInputStream(dcFile));
    try {
        filter.readFields(dis);
    } finally {
        dis.close();
    }
}
Project: spork    File: BuildBloom.java
@Override
public Tuple exec(Tuple input) throws IOException {
    if (input == null || input.size() == 0) return null;

    // Strip off the initial level of bag
    DataBag values = (DataBag)input.get(0);
    Iterator<Tuple> it = values.iterator();
    Tuple t = it.next();

    // If the input tuple has only one field, then we'll extract
    // that field and serialize it into a key.  If it has multiple
    // fields, we'll serialize the whole tuple.
    byte[] b;
    if (t.size() == 1) b = DataType.toBytes(t.get(0));
    else b = DataType.toBytes(t, DataType.TUPLE);

    Key k = new Key(b);
    filter = new BloomFilter(vSize, numHash, hType);
    filter.add(k);

    return TupleFactory.getInstance().newTuple(bloomOut());
}
Project: PonIC    File: BuildBloom.java
@Override
public Tuple exec(Tuple input) throws IOException {
    if (input == null || input.size() == 0) return null;

    // Strip off the initial level of bag
    DataBag values = (DataBag)input.get(0);
    Iterator<Tuple> it = values.iterator();
    Tuple t = it.next();

    // If the input tuple has only one field, then we'll extract
    // that field and serialize it into a key.  If it has multiple
    // fields, we'll serialize the whole tuple.
    byte[] b;
    if (t.size() == 1) b = DataType.toBytes(t.get(0));
    else b = DataType.toBytes(t, DataType.TUPLE);

    Key k = new Key(b);
    filter = new BloomFilter(vSize, numHash, hType);
    filter.add(k);

    return TupleFactory.getInstance().newTuple(bloomOut());
}
Project: sedge    File: BuildBloom.java
@Override
public Tuple exec(Tuple input) throws IOException {
    if (input == null || input.size() == 0) return null;

    // Strip off the initial level of bag
    DataBag values = (DataBag)input.get(0);
    Iterator<Tuple> it = values.iterator();
    Tuple t = it.next();

    // If the input tuple has only one field, then we'll extract
    // that field and serialize it into a key.  If it has multiple
    // fields, we'll serialize the whole tuple.
    byte[] b;
    if (t.size() == 1) b = DataType.toBytes(t.get(0));
    else b = DataType.toBytes(t, DataType.TUPLE);

    Key k = new Key(b);
    filter = new BloomFilter(vSize, numHash, hType);
    filter.add(k);

    return TupleFactory.getInstance().newTuple(bloomOut());
}
Project: hiped2    File: BloomFilterDumper.java
public static BloomFilter readFromAvro(InputStream is) throws IOException {
  DataFileStream<Object> reader =
      new DataFileStream<Object>(
          is, new GenericDatumReader<Object>());

  if (!reader.hasNext()) {
    throw new IOException("Avro file contains no records");
  }
  BloomFilter filter = new BloomFilter();
  AvroBytesRecord
      .fromGenericRecord((GenericRecord) reader.next(), filter);
  IOUtils.closeQuietly(is);
  IOUtils.closeQuietly(reader);

  return filter;
}
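For the write side, which this dumper assumes has already happened, serializing a BloomFilter into the byte[] such an Avro bytes record carries can be sketched with Hadoop's DataOutputBuffer (an illustration, not the project's AvroBytesRecord internals; filter is an existing BloomFilter):

// Sketch (assumption): capture the filter's Writable form as a byte[].
DataOutputBuffer buffer = new DataOutputBuffer();
filter.write(buffer); // BloomFilter implements Writable
byte[] bytes = Arrays.copyOf(buffer.getData(), buffer.getLength());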
Project: hiped2    File: BloomFilterCreator.java
/**
 * The MapReduce driver - setup and launch the job.
 *
 * @param args the command-line arguments
 * @return the process exit code
 * @throws Exception if something goes wrong
 */
public int run(final String[] args) throws Exception {

  Cli cli = Cli.builder().setArgs(args).addOptions(CliCommonOpts.MrIoOpts.values()).build();
  int result = cli.runCmd();

  if (result != 0) {
    return result;
  }

  Path inputPath = new Path(cli.getArgValueAsString(CliCommonOpts.MrIoOpts.INPUT));
  Path outputPath = new Path(cli.getArgValueAsString(CliCommonOpts.MrIoOpts.OUTPUT));

  Configuration conf = super.getConf();

  JobConf job = new JobConf(conf);
  job.setJarByClass(BloomFilterCreator.class);

  job.set(AvroJob.OUTPUT_SCHEMA, AvroBytesRecord.SCHEMA.toString());
  job.set(AvroJob.OUTPUT_CODEC, SnappyCodec.class.getName());

  job.setInputFormat(KeyValueTextInputFormat.class);
  job.setOutputFormat(AvroOutputFormat.class);

  job.setMapperClass(Map.class);
  job.setReducerClass(Reduce.class);

  job.setMapOutputKeyClass(NullWritable.class);
  job.setMapOutputValueClass(BloomFilter.class);

  job.setOutputKeyClass(NullWritable.class);
  job.setOutputValueClass(BloomFilter.class);

  FileInputFormat.setInputPaths(job, inputPath);
  FileOutputFormat.setOutputPath(job, outputPath);

  return JobClient.runJob(job).isSuccessful() ? 0 : 1;
}
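Since JobConf defaults to a single reduce task, every map-side partial filter reaches one reducer, which ORs them into a single Bloom filter in the job output (see the reduce implementation below).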
Project: hiped2    File: BloomFilterCreator.java
@Override
public void map(Text key, Text value,
                OutputCollector<NullWritable, BloomFilter> output,
                Reporter reporter) throws IOException {

  System.out.println("K[" + key + "]");

  int age = Integer.parseInt(value.toString());
  if (age > 30) {
    filter.add(new Key(key.toString().getBytes()));
  }
  collector = output;
}
Project: hiped2    File: BloomFilterCreator.java
@Override
public void reduce(NullWritable key, Iterator<BloomFilter> values,
                   OutputCollector<AvroWrapper<GenericRecord>,
                       NullWritable> output,
                   Reporter reporter) throws IOException {
  while (values.hasNext()) {
    BloomFilter bf = values.next();
    filter.or(bf);
    System.out.println(filter);
  }
  collector = output;
}
Project: hiped2    File: BloomFilterCreator.java
public static BloomFilter readFromAvro(InputStream is) throws IOException {
  DataFileStream<Object> reader =
      new DataFileStream<Object>(
          is, new GenericDatumReader<Object>());

  if (!reader.hasNext()) {
    throw new IOException("Avro file contains no records");
  }
  BloomFilter filter = new BloomFilter();
  AvroBytesRecord
      .fromGenericRecord((GenericRecord) reader.next(), filter);
  IOUtils.closeQuietly(is);
  IOUtils.closeQuietly(reader);

  return filter;
}
Project: bloomfilter-course    File: Trainer.java
public BloomFilter createBloomFilter(int numMembers, float falsePosRate) {
    // TODO calculate the optimal Bloom filter size
    // TODO and the optimal number of hash functions
    int vectorSize = getOptimalBloomFilterSize(numMembers, falsePosRate);
    int nbHash = getOptimalK(numMembers, vectorSize);

    // TODO create new Bloom filter
    BloomFilter filter = new BloomFilter(vectorSize, nbHash,
            Hash.MURMUR_HASH);

    return filter;
}
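The two helpers are not shown in the snippet. With n expected members and target false-positive rate p, the standard optima are a bit vector of m = -n ln p / (ln 2)^2 bits and k = (m/n) ln 2 hash functions; a plausible implementation consistent with the method names (an assumption, not the course's actual code):

// Sketch: optimal bit-vector size, m = -n * ln(p) / (ln 2)^2
public static int getOptimalBloomFilterSize(int numMembers, float falsePosRate) {
    return (int) Math.ceil(-numMembers * Math.log(falsePosRate)
            / Math.pow(Math.log(2), 2));
}

// Sketch: optimal number of hash functions, k = (m / n) * ln 2
public static int getOptimalK(float numMembers, float vectorSize) {
    return (int) Math.round(vectorSize / numMembers * Math.log(2));
}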
Project: Gaffer    File: AccumuloIDWithinSetRetriever.java
ElementIteratorReadIntoMemory() throws RetrieverException {
    vertices = extractVertices(seedsIter);

    // Create Bloom filter, read through set of entities and add them to
    // Bloom filter
    final BloomFilter filter = BloomFilterUtils.getBloomFilter(store.getProperties().getFalsePositiveRate(),
            vertices.size(), store.getProperties().getMaxBloomFilterToPassToAnIterator());
    addToBloomFilter(vertices, filter);

    initialise(filter);
}
Project: Gaffer    File: AccumuloIDBetweenSetsRetriever.java
ElementIteratorReadIntoMemory() throws RetrieverException {
    verticesA = extractVertices(seedSetAIter);
    verticesB = extractVertices(seedSetBIter);

    // Create Bloom filter, read through set of entities B and add them
    // to Bloom filter
    final BloomFilter filter = BloomFilterUtils.getBloomFilter(store.getProperties().getFalsePositiveRate(),
            verticesB.size(), store.getProperties().getMaxBloomFilterToPassToAnIterator());
    addToBloomFilter(verticesB, filter);
    initialise(filter);
}
Project: Gaffer    File: AccumuloSetRetriever.java
protected void addToBloomFilter(final Iterator<? extends Object> vertices, final BloomFilter filter)
        throws RetrieverException {
    try {
        while (vertices.hasNext()) {
            addToBloomFilter(vertices.next(), filter);
        }
    } finally {
        CloseableUtil.close(vertices);
    }
}
Project: Gaffer    File: AccumuloSetRetriever.java
protected void addToBloomFilter(final Iterator<? extends EntityId> seeds, final BloomFilter filter1,
                                final BloomFilter filter2) throws RetrieverException {
    try {
        while (seeds.hasNext()) {
            addToBloomFilter(seeds.next(), filter1, filter2);
        }
    } finally {
        CloseableUtil.close(seeds);
    }
}
Project: Gaffer    File: AccumuloSetRetriever.java
private void addToBloomFilter(final Object vertex, final BloomFilter filter) throws RetrieverException {
    try {
        filter.add(new org.apache.hadoop.util.bloom.Key(elementConverter.serialiseVertex(vertex)));
    } catch (final AccumuloElementConversionException e) {
        throw new RetrieverException("Failed to add identifier to the bloom key", e);
    }
}
Project: spork-streaming    File: Bloom.java
/**
 * For testing only, do not use directly.
 */
public void setFilter(DataByteArray dba) throws IOException {
    DataInputStream dis = new DataInputStream(new
        ByteArrayInputStream(dba.get()));
    filter = new BloomFilter();
    filter.readFields(dis);
}
Project: spork-streaming    File: BuildBloomBase.java
protected DataByteArray bloomOr(Tuple input) throws IOException {
    filter = new BloomFilter(vSize, numHash, hType);

    try {
        DataBag values = (DataBag)input.get(0);
        for (Iterator<Tuple> it = values.iterator(); it.hasNext();) {
            Tuple t = it.next();
            filter.or(bloomIn((DataByteArray)t.get(0)));
        }
    } catch (ExecException ee) {
        throw new IOException(ee);
    }

    return bloomOut();
}
Project: spork-streaming    File: BuildBloomBase.java
protected BloomFilter bloomIn(DataByteArray b) throws IOException {
    DataInputStream dis = new DataInputStream(new
        ByteArrayInputStream(b.get()));
    BloomFilter f = new BloomFilter();
    f.readFields(dis);
    return f;
}
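bloomOut() is referenced throughout these UDFs but never shown; it is the mirror image of bloomIn() above. A sketch consistent with that usage (an assumption, not the actual Pig source):

// Sketch (assumption): serialize the current filter into a Pig DataByteArray.
protected DataByteArray bloomOut() throws IOException {
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    filter.write(new DataOutputStream(baos));
    return new DataByteArray(baos.toByteArray());
}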
Project: spork    File: Bloom.java
/**
 * For testing only, do not use directly.
 */
public void setFilter(DataByteArray dba) throws IOException {
    DataInputStream dis = new DataInputStream(new
        ByteArrayInputStream(dba.get()));
    filter = new BloomFilter();
    filter.readFields(dis);
}
Project: spork    File: BuildBloomBase.java
protected DataByteArray bloomOr(Tuple input) throws IOException {
    filter = new BloomFilter(vSize, numHash, hType);

    try {
        DataBag values = (DataBag)input.get(0);
        for (Iterator<Tuple> it = values.iterator(); it.hasNext();) {
            Tuple t = it.next();
            filter.or(bloomIn((DataByteArray)t.get(0)));
        }
    } catch (ExecException ee) {
        throw new IOException(ee);
    }

    return bloomOut();
}
Project: spork    File: BuildBloomBase.java
protected BloomFilter bloomIn(DataByteArray b) throws IOException {
    DataInputStream dis = new DataInputStream(new
        ByteArrayInputStream(b.get()));
    BloomFilter f = new BloomFilter();
    f.readFields(dis);
    return f;
}
Project: PonIC    File: Bloom.java
/**
 * For testing only, do not use directly.
 */
public void setFilter(DataByteArray dba) throws IOException {
    DataInputStream dis = new DataInputStream(new
        ByteArrayInputStream(dba.get()));
    filter = new BloomFilter();
    filter.readFields(dis);
}
Project: PonIC    File: BuildBloomBase.java
protected DataByteArray bloomOr(Tuple input) throws IOException {
    filter = new BloomFilter(vSize, numHash, hType);

    try {
        DataBag values = (DataBag)input.get(0);
        for (Iterator<Tuple> it = values.iterator(); it.hasNext();) {
            Tuple t = it.next();
            filter.or(bloomIn((DataByteArray)t.get(0)));
        }
    } catch (ExecException ee) {
        throw new IOException(ee);
    }

    return bloomOut();
}
Project: PonIC    File: BuildBloomBase.java
protected BloomFilter bloomIn(DataByteArray b) throws IOException {
    DataInputStream dis = new DataInputStream(new
        ByteArrayInputStream(b.get()));
    BloomFilter f = new BloomFilter();
    f.readFields(dis);
    return f;
}
Project: sedge    File: Bloom.java
/**
 * For testing only, do not use directly.
 */
public void setFilter(DataByteArray dba) throws IOException {
    DataInputStream dis = new DataInputStream(new
        ByteArrayInputStream(dba.get()));
    filter = new BloomFilter();
    filter.readFields(dis);
}
Project: sedge    File: BuildBloomBase.java
protected DataByteArray bloomOr(Tuple input) throws IOException {
    filter = new BloomFilter(vSize, numHash, hType);

    try {
        DataBag values = (DataBag)input.get(0);
        for (Iterator<Tuple> it = values.iterator(); it.hasNext();) {
            Tuple t = it.next();
            filter.or(bloomIn((DataByteArray)t.get(0)));
        }
    } catch (ExecException ee) {
        throw new IOException(ee);
    }

    return bloomOut();
}
Project: sedge    File: BuildBloomBase.java
protected BloomFilter bloomIn(DataByteArray b) throws IOException {
    DataInputStream dis = new DataInputStream(new
        ByteArrayInputStream(b.get()));
    BloomFilter f = new BloomFilter();
    f.readFields(dis);
    return f;
}
Project: hiped2    File: BloomFilterDumper.java
public static BloomFilter fromFile(File f) throws IOException {
  return readFromAvro(FileUtils.openInputStream(f));
}
Project: hiped2    File: BloomFilterDumper.java
public static BloomFilter fromPath(Configuration config, Path path) throws IOException {
  FileSystem hdfs = path.getFileSystem(config);

  return readFromAvro(hdfs.open(path));
}
Project: hiped2    File: BloomFilterCreator.java
@Override
protected void reduce(NullWritable key, Iterable<BloomFilter> values, Context context) throws IOException, InterruptedException {
  for (BloomFilter bf : values) {
    filter.or(bf);
  }
}
Project: hiped2    File: BloomFilterCreator.java
public static BloomFilter fromFile(File f) throws IOException {
  return readFromAvro(FileUtils.openInputStream(f));
}
Project: bloomfilter-course    File: Tester.java
public void redisMembershipTestWithFilter(Path input, Path bloom)
        throws Exception {

    System.out.println("Testing Redis set membership of " + input
            + " using a Bloom filter " + bloom);

    // TODO create a fileSystem object
    FileSystem fs = FileSystem.get(getConf());

    // TODO connect to Redis at localhost, port 6379
    jedis = new Jedis("localhost", 6379);
    jedis.connect();

    // TODO Create a new BloomFilter object
    BloomFilter filter = new BloomFilter();

    // TODO call readFields with an FSDataInputStream from the file
    filter.readFields(fs.open(bloom));

    // TODO Open the testing file for read
    String line = null;
    int numBFhits = 0, numhits = 0, numlines = 0;
    BufferedReader rdr = new BufferedReader(new InputStreamReader(
            fs.open(input)));

    // TODO create a new Key to re-use
    Key key = new Key();

    long start = System.currentTimeMillis();
    while ((line = rdr.readLine()) != null) {
        // TODO increment numlines
        ++numlines;

        // TODO set the bytes of the key to line's bytes with a weight of 1.0
        key.set(line.getBytes(), 1.0);

        // TODO membership test the key
        if (filter.membershipTest(key)) {
            // TODO increment numBFhits
            ++numBFhits;

            // TODO test jedis using sismember
            if (jedis.sismember(REDIS_SET_KEY, line)) {
                // TODO increment numhits
                ++numhits;
            }
        }
    }
    long finish = System.currentTimeMillis();

    // TODO close the file reader and Redis client
    rdr.close();
    jedis.disconnect();

    System.out.println("Took " + (finish - start) + " ms to check Redis "
            + numlines + " times for " + numhits
            + " successful tests.  Bloom filter hits: " + numBFhits
            + " False postives: " + (numBFhits - numhits));
}
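The printed false-positive count is the number of Bloom hits that Redis rejected; dividing by the true negatives tested gives the observed false-positive rate, which should track the rate the filter was trained for:

// Observed rate = FP / (FP + TN) = (numBFhits - numhits) / (numlines - numhits)
double observedFpRate = (numBFhits - numhits) / (double) (numlines - numhits);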
Project: bloomfilter-course    File: Trainer.java
@Override
public int run(String[] args) throws Exception {

    if (args.length != 4) {
        System.err
                .println("Usage: Trainer <totrain> <nummembers> <falseposrate> <bfoutfile>");
        return 1;
    }

    // Parse command line arguments
    Path inputFile = new Path(args[0]);
    int numMembers = Integer.parseInt(args[1]);
    float falsePosRate = Float.parseFloat(args[2]);
    Path bfFile = new Path(args[3]);

    // TODO Create a new Jedis object using localhost at port 6379
    jedis = new Jedis("localhost", 6379);

    // TODO delete the REDIS_SET_KEY
    jedis.del(REDIS_SET_KEY);

    // TODO Create a new Bloom filter
    BloomFilter filter = createBloomFilter(numMembers, falsePosRate);

    // TODO open the file for read
    FileSystem fs = FileSystem.get(getConf());
    String line = null;
    int numRecords = 0;
    BufferedReader rdr = new BufferedReader(new InputStreamReader(
            fs.open(inputFile)));

    while ((line = rdr.readLine()) != null) {
        // TODO if the line is not empty
        if (!line.isEmpty()) {
            // TODO add the line to the Bloom filter
            filter.add(new Key(line.getBytes()));

            // TODO use Jedis client's "sadd" method to set
            jedis.sadd(REDIS_SET_KEY, line);

            // TODO increment numRecords
            ++numRecords;
        }
    }

    // TODO Close reader, disconnect Jedis client
    rdr.close();
    jedis.disconnect();

    System.out.println("Trained Bloom filter with " + numRecords
            + " entries.");

    System.out.println("Serializing Bloom filter to HDFS at " + bfFile);

    // TODO create a new FSDataOutputStream using the FileSystem
    FSDataOutputStream strm = fs.create(bfFile);

    // TODO pass the stream to the Bloom filter
    filter.write(strm);

    // TODO close the stream
    strm.flush();
    strm.close();

    System.out.println("Done training Bloom filter.");
    return 0;
}
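run(String[]) together with getConf() suggests the usual Configured/Tool pattern, so the class would be launched through ToolRunner; a sketch of the assumed entry point:

// Sketch (assumption): standard ToolRunner entry point for a Tool implementation.
public static void main(String[] args) throws Exception {
    System.exit(ToolRunner.run(new Configuration(), new Trainer(), args));
}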
Project: Gaffer    File: AccumuloSetRetriever.java
protected void addToBloomFilter(final Iterable<? extends Object> vertices, final BloomFilter filter)
        throws RetrieverException {
    addToBloomFilter(vertices.iterator(), filter);
}
Project: Gaffer    File: AccumuloSetRetriever.java
protected void addToBloomFilter(final EntityId seed, final BloomFilter filter1, final BloomFilter filter2)
        throws RetrieverException {
    addToBloomFilter(seed.getVertex(), filter1);
    addToBloomFilter(seed.getVertex(), filter2);
}
Project: Gaffer    File: AbstractCoreKeyIteratorSettingsFactory.java
@Override
public IteratorSetting getBloomFilterIteratorSetting(final BloomFilter filter) throws IteratorSettingException {
    return new IteratorSettingBuilder(AccumuloStoreConstants.BLOOM_FILTER_ITERATOR_PRIORITY,
            AccumuloStoreConstants.BLOOM_FILTER_ITERATOR_NAME, CoreKeyBloomFilterIterator.class).bloomFilter(filter).build();
}
Project: spork-streaming    File: Bloom.java
private void init() throws IOException {
    filter = new BloomFilter();
    String dcFile = "./" + getFilenameFromPath(bloomFile) +
        "/part-r-00000";
    filter.readFields(new DataInputStream(new FileInputStream(dcFile)));
}