protected void setupOutput(final Job job, final AddElementsFromHdfs operation, final HBaseStore store) throws IOException {
    FileOutputFormat.setOutputPath(job, new Path(operation.getOutputPath()));

    final String stagingDir = operation.getOption(HBaseStoreConstants.OPERATION_HDFS_STAGING_PATH);
    if (null != stagingDir && !stagingDir.isEmpty()) {
        job.getConfiguration().set(HConstants.TEMPORARY_FS_DIRECTORY_KEY, stagingDir);
    }

    try {
        HFileOutputFormat2.configureIncrementalLoad(
                job,
                store.getTable(),
                store.getConnection().getRegionLocator(store.getTableName()));
    } catch (final StoreException e) {
        throw new RuntimeException(e);
    }
}
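// The method above only configures the job to write HFiles; the load step lives elsewhere.
// A minimal sketch of that complementary step (not part of the Gaffer snippet above),
// assuming the job has completed and the store exposes the same table/connection
// accessors used in setupOutput:
LoadIncrementalHFiles loader = new LoadIncrementalHFiles(job.getConfiguration());
loader.doBulkLoad(new Path(operation.getOutputPath()),
        store.getConnection().getAdmin(),
        store.getTable(),
        store.getConnection().getRegionLocator(store.getTableName()));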
public void doBulkLoadSinglePut(boolean verbose, HTable table) throws Exception {
    ClassTools.preLoad(LoadIncrementalHFiles.class);

    // set up the bulk-load temp folder
    HDFSPath bulkLoadPath = new HDFSPath(getConfiguration(), "/tmp/" + UUID.randomUUID().toString());
    if (bulkLoadPath.existsDir()) {
        bulkLoadPath.trash();
    }

    // set up the job
    setMapOutputKeyClass(ImmutableBytesWritable.class);
    setMapOutputValueClass(Put.class);
    HFileOutputFormat2.configureIncrementalLoad(this, table);
    HFileOutputFormat2.setOutputPath(this, bulkLoadPath);

    if (waitForCompletion(verbose)) {
        // load the generated HFiles into the table
        LoadIncrementalHFiles loader = new LoadIncrementalHFiles(conf);
        loader.doBulkLoad(bulkLoadPath, table);
    } else {
        log.info("loading failed.");
    }
}
@Override
public void init() throws IOException {
    super.init();

    Configuration taskConf = new Configuration();
    Path stagingResultDir = new Path(stagingDir, TajoConstants.RESULT_DIR_NAME);
    taskConf.set(FileOutputFormat.OUTDIR, stagingResultDir.toString());

    ExecutionBlockId ebId = taskAttemptId.getTaskId().getExecutionBlockId();
    writerContext = new TaskAttemptContextImpl(taskConf,
            new TaskAttemptID(ebId.getQueryId().toString(), ebId.getId(), TaskType.MAP,
                    taskAttemptId.getTaskId().getId(), taskAttemptId.getId()));

    HFileOutputFormat2 hFileOutputFormat2 = new HFileOutputFormat2();
    try {
        writer = hFileOutputFormat2.getRecordWriter(writerContext);

        committer = new FileOutputCommitter(FileOutputFormat.getOutputPath(writerContext), writerContext);
        workingFilePath = committer.getWorkPath();
    } catch (InterruptedException e) {
        throw new IOException(e.getMessage(), e);
    }

    LOG.info("Created hbase file writer: " + workingFilePath);
}
@Override
public int run(String[] args) throws Exception {
    if (args.length != 3) {
        System.err.println("Usage: bulkload [-D" + MRJobConfig.QUEUE_NAME + "=proofofconcepts] [-D" + SKIP_INVALID_PROPERTY + "=true] [-D" + SPLIT_BITS_PROPERTY + "=8] [-D" + DEFAULT_CONTEXT_PROPERTY + "=http://new_context] [-D" + OVERRIDE_CONTEXT_PROPERTY + "=true] <input_path(s)> <output_path> <table_name>");
        return -1;
    }
    TableMapReduceUtil.addDependencyJars(getConf(),
            NTriplesUtil.class,
            Rio.class,
            AbstractRDFHandler.class,
            RDFFormat.class,
            RDFParser.class);
    HBaseConfiguration.addHbaseResources(getConf());
    getConf().setLong(DEFAULT_TIMESTAMP_PROPERTY, getConf().getLong(DEFAULT_TIMESTAMP_PROPERTY, System.currentTimeMillis()));
    Job job = Job.getInstance(getConf(), "HalyardBulkLoad -> " + args[1] + " -> " + args[2]);
    job.setJarByClass(HalyardBulkLoad.class);
    job.setMapperClass(RDFMapper.class);
    job.setMapOutputKeyClass(ImmutableBytesWritable.class);
    job.setMapOutputValueClass(KeyValue.class);
    job.setInputFormatClass(RioFileInputFormat.class);
    job.setSpeculativeExecution(false);
    job.setReduceSpeculativeExecution(false);
    try (HTable hTable = HalyardTableUtils.getTable(getConf(), args[2], true, getConf().getInt(SPLIT_BITS_PROPERTY, 3))) {
        HFileOutputFormat2.configureIncrementalLoad(job, hTable.getTableDescriptor(), hTable.getRegionLocator());
        FileInputFormat.setInputDirRecursive(job, true);
        FileInputFormat.setInputPaths(job, args[0]);
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        TableMapReduceUtil.addDependencyJars(job);
        TableMapReduceUtil.initCredentials(job);
        if (job.waitForCompletion(true)) {
            if (getConf().getBoolean(TRUNCATE_PROPERTY, false)) {
                HalyardTableUtils.truncateTable(hTable).close();
            }
            new LoadIncrementalHFiles(getConf()).doBulkLoad(new Path(args[1]), hTable);
            LOG.info("Bulk Load Completed..");
            return 0;
        }
    }
    return -1;
}
@Override
public int run(String[] args) throws Exception {
    if (args.length != 3) {
        System.err.println("Usage: hiveload -D" + RDF_MIME_TYPE_PROPERTY + "='application/ld+json' [-D" + MRJobConfig.QUEUE_NAME + "=proofofconcepts] [-D" + HIVE_DATA_COLUMN_INDEX_PROPERTY + "=3] [-D" + BASE_URI_PROPERTY + "='http://my_base_uri/'] [-D" + HalyardBulkLoad.SPLIT_BITS_PROPERTY + "=8] [-D" + HalyardBulkLoad.DEFAULT_CONTEXT_PROPERTY + "=http://new_context] [-D" + HalyardBulkLoad.OVERRIDE_CONTEXT_PROPERTY + "=true] <hive_table_name> <output_path> <hbase_table_name>");
        return -1;
    }
    TableMapReduceUtil.addDependencyJars(getConf(),
            NTriplesUtil.class,
            Rio.class,
            AbstractRDFHandler.class,
            RDFFormat.class,
            RDFParser.class);
    HBaseConfiguration.addHbaseResources(getConf());
    getConf().setLong(DEFAULT_TIMESTAMP_PROPERTY, getConf().getLong(DEFAULT_TIMESTAMP_PROPERTY, System.currentTimeMillis()));
    Job job = Job.getInstance(getConf(), "HalyardHiveLoad -> " + args[1] + " -> " + args[2]);
    int i = args[0].indexOf('.');
    HCatInputFormat.setInput(job, i > 0 ? args[0].substring(0, i) : null, args[0].substring(i + 1));
    job.setJarByClass(HalyardHiveLoad.class);
    job.setMapperClass(HiveMapper.class);
    job.setMapOutputKeyClass(ImmutableBytesWritable.class);
    job.setMapOutputValueClass(KeyValue.class);
    job.setInputFormatClass(HCatInputFormat.class);
    job.setSpeculativeExecution(false);
    job.setReduceSpeculativeExecution(false);
    try (HTable hTable = HalyardTableUtils.getTable(getConf(), args[2], true, getConf().getInt(HalyardBulkLoad.SPLIT_BITS_PROPERTY, 3))) {
        HFileOutputFormat2.configureIncrementalLoad(job, hTable.getTableDescriptor(), hTable.getRegionLocator());
        FileInputFormat.setInputDirRecursive(job, true);
        FileInputFormat.setInputPaths(job, args[0]);
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        TableMapReduceUtil.addDependencyJars(job);
        TableMapReduceUtil.initCredentials(job);
        if (job.waitForCompletion(true)) {
            new LoadIncrementalHFiles(getConf()).doBulkLoad(new Path(args[1]), hTable);
            LOG.info("Bulk Load Completed..");
            return 0;
        }
    }
    return -1;
}
/**
 * Configures the job for bulk load, submits it, waits for completion, and then
 * loads the generated HFiles into the output table.
 * @param job job to configure and run
 * @param outputPath path where the job writes its HFiles
 * @param outputTableName table the HFiles are loaded into
 * @param skipDependencyJars if true, clears the "tmpjars" setting before the job runs
 * @throws Exception if the job fails or the bulk load cannot complete
 */
private void configureRunnableJobUsingBulkLoad(Job job, Path outputPath, TableName outputTableName, boolean skipDependencyJars) throws Exception {
    job.setMapperClass(getBulkMapperClass());
    job.setMapOutputKeyClass(ImmutableBytesWritable.class);
    job.setMapOutputValueClass(KeyValue.class);
    final Configuration configuration = job.getConfiguration();
    try (Connection conn = ConnectionFactory.createConnection(configuration);
            Admin admin = conn.getAdmin();
            Table table = conn.getTable(outputTableName);
            RegionLocator regionLocator = conn.getRegionLocator(outputTableName)) {
        HFileOutputFormat2.configureIncrementalLoad(job, table, regionLocator);
        if (skipDependencyJars) {
            job.getConfiguration().unset("tmpjars");
        }
        boolean status = job.waitForCompletion(true);
        if (!status) {
            LOG.error("IndexTool job failed!");
            throw new Exception("IndexTool job failed: " + job.toString());
        }

        LOG.info("Loading HFiles from {}", outputPath);
        LoadIncrementalHFiles loader = new LoadIncrementalHFiles(configuration);
        loader.doBulkLoad(outputPath, admin, table, regionLocator);
    }
    FileSystem.get(configuration).delete(outputPath, true);
}
/**
 * Sets up the actual job.
 * @param args The command line parameters.
 * @return The newly created job.
 * @throws IOException When setting up the job fails.
 */
public Job createSubmittableJob(String[] args) throws IOException {
    Configuration conf = getConf();
    String inputDirs = args[0];
    String tabName = args[1];
    conf.setStrings(TABLES_KEY, tabName);
    conf.set(FileInputFormat.INPUT_DIR, inputDirs);
    Job job = Job.getInstance(conf,
            conf.get(JOB_NAME_CONF_KEY, NAME + "_" + EnvironmentEdgeManager.currentTime()));
    job.setJarByClass(MapReduceHFileSplitterJob.class);
    job.setInputFormatClass(HFileInputFormat.class);
    job.setMapOutputKeyClass(ImmutableBytesWritable.class);
    String hfileOutPath = conf.get(BULK_OUTPUT_CONF_KEY);
    if (hfileOutPath != null) {
        LOG.debug("Adding incremental load job: " + hfileOutPath + " from " + inputDirs);
        TableName tableName = TableName.valueOf(tabName);
        job.setMapperClass(HFileCellMapper.class);
        job.setReducerClass(CellSortReducer.class);
        Path outputDir = new Path(hfileOutPath);
        FileOutputFormat.setOutputPath(job, outputDir);
        job.setMapOutputValueClass(MapReduceExtendedCell.class);
        try (Connection conn = ConnectionFactory.createConnection(conf);
                Table table = conn.getTable(tableName);
                RegionLocator regionLocator = conn.getRegionLocator(tableName)) {
            HFileOutputFormat2.configureIncrementalLoad(job, table.getDescriptor(), regionLocator);
        }
        LOG.debug("Successfully configured the incremental load job");
        TableMapReduceUtil.addDependencyJars(job.getConfiguration(),
                org.apache.hbase.thirdparty.com.google.common.base.Preconditions.class);
    } else {
        throw new IOException("No bulk output directory specified");
    }
    return job;
}
@Override
public int run(String[] args) throws Exception {
    Configuration conf = HBaseConfiguration.create();
    String[] remainArgs = remainArgs(args, conf);
    options = new LoadVCFToHBaseOptions();
    options.parse(remainArgs);
    options.setHadoopConf(remainArgs, conf);
    conf.addResource(new Path(options.getConfig() + "hbase-site.xml"));
    conf.addResource(new Path(options.getConfig() + "core-site.xml"));
    conf.set("vcfHeader", options.getHeaderOutput());

    Job job = Job.getInstance(conf);
    createTable(conf, options.getTableName());

    MultipleVCFHeader vcfHeaders = new MultipleVCFHeader();
    vcfHeaders.mergeHeader(new Path(options.getInput()), options.getHeaderOutput(), job, false);

    job.setJobName("vcf to hbase");
    job.setNumReduceTasks(options.getReducerNumber());
    job.setInputFormatClass(VCFMultipleInputFormat.class);
    job.setJarByClass(LoadVCFToHBase.class);
    job.setMapperClass(VCFToHBaseMapper.class);
    job.setReducerClass(PutSortReducer.class);
    job.setMapOutputKeyClass(ImmutableBytesWritable.class);
    job.setMapOutputValueClass(Put.class);
    job.setOutputKeyClass(ImmutableBytesWritable.class);
    job.setOutputValueClass(Put.class);

    FileInputFormat.setInputPaths(job, new Path(options.getInput()));
    FileOutputFormat.setOutputPath(job, new Path(options.getHFileOutput()));
    HFileOutputFormat2.configureIncrementalLoad(job, new HTable(conf, options.getTableName()));

    if (job.waitForCompletion(true)) {
        LoadHFile2HBase(conf, options.getTableName(), options.getHFileOutput());
        return 0;
    } else {
        return 1;
    }
}
@Override
public int run(String[] args) throws Exception {
    Configuration conf = HBaseConfiguration.create();
    String[] remainArgs = remainArgs(args, conf);
    options = new DBNSFPToHbaseOptions();
    options.parse(remainArgs);
    options.setHadoopConf(remainArgs, conf);
    tableName = TableName.valueOf(options.getTableName());
    conf.set("DEFAULT_COLUMN_FAMILY", "data");
    conf.addResource(new Path(options.getConfig() + "hbase-site.xml"));
    conf.addResource(new Path(options.getConfig() + "core-site.xml"));
    conn = ConnectionFactory.createConnection(conf);
    setHeader(new Path(options.getInput()), conf);

    long reduceThreshMem = (long) (1 << 28);
    conf.setLong("putsortreducer.row.threshold", reduceThreshMem);

    Job job = Job.getInstance(conf, "dbNSFPtoHbase");
    createTable(tableName);
    job.setJarByClass(org.bgi.flexlab.gaea.tools.mapreduce.annotator.databaseload.DBNSFPToHbase.class);
    job.setMapperClass(DBNSFPToHbaseMapper.class);
    job.setReducerClass(PutSortReducer.class);
    job.setMapOutputKeyClass(ImmutableBytesWritable.class);
    job.setMapOutputValueClass(Put.class);
    job.setOutputKeyClass(ImmutableBytesWritable.class);
    job.setOutputValueClass(Put.class);

    FileInputFormat.setInputPaths(job, new Path(options.getInput()));
    FileOutputFormat.setOutputPath(job, new Path(options.getHFileOutput()));
    HFileOutputFormat2.configureIncrementalLoad(job, conn.getTable(tableName), conn.getRegionLocator(tableName));

    if (job.waitForCompletion(true)) {
        LoadHFile2HBase(conf, tableName, options.getHFileOutput());
        conn.close();
        return 0;
    } else {
        conn.close();
        return 1;
    }
}
@Override
public int run(String[] args) throws Exception {
    if (args.length != 3) {
        System.err.println("Usage: bulkupdate [-D" + MRJobConfig.QUEUE_NAME + "=proofofconcepts] <input_file_with_SPARQL_queries> <output_path> <table_name>");
        return -1;
    }
    TableMapReduceUtil.addDependencyJars(getConf(),
            HalyardExport.class,
            NTriplesUtil.class,
            Rio.class,
            AbstractRDFHandler.class,
            RDFFormat.class,
            RDFParser.class,
            HTable.class,
            HBaseConfiguration.class,
            AuthenticationProtos.class,
            Trace.class,
            Gauge.class);
    HBaseConfiguration.addHbaseResources(getConf());
    getConf().setStrings(TABLE_NAME_PROPERTY, args[2]);
    getConf().setLong(DEFAULT_TIMESTAMP_PROPERTY, getConf().getLong(DEFAULT_TIMESTAMP_PROPERTY, System.currentTimeMillis()));
    Job job = Job.getInstance(getConf(), "HalyardBulkUpdate -> " + args[1] + " -> " + args[2]);
    NLineInputFormat.setNumLinesPerSplit(job, 1);
    job.setJarByClass(HalyardBulkUpdate.class);
    job.setMapperClass(SPARQLMapper.class);
    job.setMapOutputKeyClass(ImmutableBytesWritable.class);
    job.setMapOutputValueClass(KeyValue.class);
    job.setInputFormatClass(NLineInputFormat.class);
    job.setSpeculativeExecution(false);
    job.setReduceSpeculativeExecution(false);
    try (HTable hTable = HalyardTableUtils.getTable(getConf(), args[2], false, 0)) {
        HFileOutputFormat2.configureIncrementalLoad(job, hTable.getTableDescriptor(), hTable.getRegionLocator());
        FileInputFormat.setInputPaths(job, args[0]);
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        TableMapReduceUtil.addDependencyJars(job);
        TableMapReduceUtil.initCredentials(job);
        if (job.waitForCompletion(true)) {
            new LoadIncrementalHFiles(getConf()).doBulkLoad(new Path(args[1]), hTable);
            LOG.info("Bulk Update Completed..");
            return 0;
        }
    }
    return -1;
}
@Test
public void shouldSetupJob() throws IOException, StoreException {
    // Given
    final JobConf localConf = createLocalConf();
    final FileSystem fs = FileSystem.getLocal(localConf);
    fs.mkdirs(new Path(outputDir));

    final HBaseAddElementsFromHdfsJobFactory factory = new HBaseAddElementsFromHdfsJobFactory();
    final Job job = mock(Job.class);
    final AddElementsFromHdfs operation = new AddElementsFromHdfs.Builder()
            .addInputMapperPair(new Path(inputDir).toString(), TextMapperGeneratorImpl.class.getName())
            .outputPath(outputDir)
            .failurePath(failureDir)
            .jobInitialiser(new TextJobInitialiser())
            .option(HBaseStoreConstants.OPERATION_HDFS_STAGING_PATH, stagingDir)
            .build();

    final HBaseStore store = new SingleUseMiniHBaseStore();
    final Schema schema = Schema.fromJson(StreamUtil.schemas(getClass()));
    final HBaseProperties properties = HBaseProperties.loadStoreProperties(StreamUtil.storeProps(getClass()));
    store.initialise("graphId", schema, properties);

    given(job.getConfiguration()).willReturn(localConf);

    // When
    factory.setupJob(job, operation, TextMapperGeneratorImpl.class.getName(), store);

    // Then
    verify(job).setJarByClass(factory.getClass());
    verify(job).setJobName("Ingest HDFS data: Generator=" + TextMapperGeneratorImpl.class.getName() + ", output=" + outputDir);

    verify(job).setMapperClass(AddElementsFromHdfsMapper.class);
    verify(job).setMapOutputKeyClass(ImmutableBytesWritable.class);
    verify(job).setMapOutputValueClass(KeyValue.class);

    verify(job).setReducerClass(AddElementsFromHdfsReducer.class);
    verify(job).setOutputKeyClass(ImmutableBytesWritable.class);
    verify(job).setOutputValueClass(KeyValue.class);
    verify(job).setOutputFormatClass(HFileOutputFormat2.class);

    assertEquals(fs.makeQualified(new Path(outputDir)).toString(),
            job.getConfiguration().get("mapreduce.output.fileoutputformat.outputdir"));

    verify(job).setNumReduceTasks(1);
}
/**
 * Sets up the actual job.
 * @param conf The current configuration.
 * @param args The command line parameters.
 * @return The newly created job.
 * @throws IOException When setting up the job fails.
 */
public static Job createSubmittableJob(Configuration conf, String[] args) throws IOException {
    TableName tableName = TableName.valueOf(args[0]);
    conf.set(TABLE_NAME, tableName.getNameAsString());
    Path inputDir = new Path(args[1]);
    Job job = Job.getInstance(conf, conf.get(JOB_NAME_CONF_KEY, NAME + "_" + tableName));
    job.setJarByClass(Importer.class);
    FileInputFormat.setInputPaths(job, inputDir);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    String hfileOutPath = conf.get(BULK_OUTPUT_CONF_KEY);

    // make sure we get the filter in the jars
    try {
        Class<? extends Filter> filter = conf.getClass(FILTER_CLASS_CONF_KEY, null, Filter.class);
        if (filter != null) {
            TableMapReduceUtil.addDependencyJars(conf, filter);
        }
    } catch (Exception e) {
        throw new IOException(e);
    }

    if (hfileOutPath != null) {
        job.setMapperClass(KeyValueImporter.class);
        try (Connection conn = ConnectionFactory.createConnection(conf);
                Table table = conn.getTable(tableName);
                RegionLocator regionLocator = conn.getRegionLocator(tableName)) {
            job.setReducerClass(KeyValueSortReducer.class);
            Path outputDir = new Path(hfileOutPath);
            FileOutputFormat.setOutputPath(job, outputDir);
            job.setMapOutputKeyClass(ImmutableBytesWritable.class);
            job.setMapOutputValueClass(KeyValue.class);
            HFileOutputFormat2.configureIncrementalLoad(job, table, regionLocator);
            TableMapReduceUtil.addDependencyJars(job.getConfiguration(),
                    com.google.common.base.Preconditions.class);
        }
    } else {
        // No reducers. Just write straight to table. Call initTableReducerJob
        // because it sets up the TableOutputFormat.
        job.setMapperClass(Importer.class);
        TableMapReduceUtil.initTableReducerJob(tableName.getNameAsString(), null, job);
        job.setNumReduceTasks(0);
    }
    return job;
}
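// All of the examples above load their HFiles with LoadIncrementalHFiles, which is
// deprecated in HBase 2.x. A minimal sketch of the replacement load step, assuming
// HBase 2.2+ on the classpath (org.apache.hadoop.hbase.tool.BulkLoadHFiles) and a job
// that has already written its HFiles; the table name and path are placeholders:
public static void bulkLoadWithNewApi(Configuration conf) throws IOException {
    // BulkLoadHFiles replaces the deprecated LoadIncrementalHFiles tool
    BulkLoadHFiles.create(conf)
            .bulkLoad(TableName.valueOf("my_table"), new Path("/tmp/hfiles"));
}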