Reading Data from HBase table using Spark

HBase is a data store, modeled after Google’s Bigtable, designed to provide quick random access to huge amounts of structured data. It leverages the fault tolerance provided by the Hadoop Distributed File System (HDFS).
HBase is a column-oriented database built on top of HDFS. It is open-source and horizontally scalable, and is used to serve very large tables (billions of rows by millions of columns) atop clusters of commodity hardware.
Suppose we have an HBase table named “student_info” with the column family “details” and the column qualifiers “sid”, “firstName”, “lastName”, “branch”, and “emailId”. Create a POJO class as below:
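A minimal sketch of such a POJO, assuming String-typed fields matching the column qualifiers listed above (the class name Student is an assumption; only a few getters and setters are shown, the rest follow the same pattern):

```java
import java.io.Serializable;

// POJO mirroring the "details" column family of the "student_info" table.
// It implements Serializable so Spark can ship instances between executors.
public class Student implements Serializable {
    private String sid;
    private String firstName;
    private String lastName;
    private String branch;
    private String emailId;

    public String getSid() { return sid; }
    public void setSid(String sid) { this.sid = sid; }

    public String getFirstName() { return firstName; }
    public void setFirstName(String firstName) { this.firstName = firstName; }

    public String getLastName() { return lastName; }
    public void setLastName(String lastName) { this.lastName = lastName; }

    public String getBranch() { return branch; }
    public void setBranch(String branch) { this.branch = branch; }

    public String getEmailId() { return emailId; }
    public void setEmailId(String emailId) { this.emailId = emailId; }
}
```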
Create a JavaSparkContext object using a SparkConf object:
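A minimal sketch, assuming the Java RDD API; the application name and the local master URL are illustrative and should be adjusted for your cluster:

```java
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;

// App name and master are placeholders; on a real cluster the master is
// usually supplied via spark-submit rather than hard-coded here.
SparkConf conf = new SparkConf()
        .setAppName("ReadHBaseStudentInfo")
        .setMaster("local[*]");
JavaSparkContext sc = new JavaSparkContext(conf);
```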
Read the data from the HBase table, providing the table name (prefixed with its namespace, if any), using the code below:
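A sketch of the read, assuming the classic HBase `TableInputFormat` integration and the JavaSparkContext `sc` from the previous step; the ZooKeeper quorum setting is an assumption to fill in for your environment:

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableInputFormat;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;

Configuration hbaseConf = HBaseConfiguration.create();
hbaseConf.set(TableInputFormat.INPUT_TABLE, "student_info");
// hbaseConf.set("hbase.zookeeper.quorum", "zk-host"); // adjust for your cluster

// Each record comes back as a (row key, Result) pair.
JavaPairRDD<ImmutableBytesWritable, Result> hbaseRDD =
        sc.newAPIHadoopRDD(hbaseConf, TableInputFormat.class,
                ImmutableBytesWritable.class, Result.class);

// Map each HBase Result into the Student POJO, reading every qualifier
// from the "details" column family.
JavaRDD<Student> studentRDD = hbaseRDD.map(tuple -> {
    Result result = tuple._2();
    byte[] cf = Bytes.toBytes("details");
    Student s = new Student();
    s.setSid(Bytes.toString(result.getValue(cf, Bytes.toBytes("sid"))));
    s.setFirstName(Bytes.toString(result.getValue(cf, Bytes.toBytes("firstName"))));
    s.setLastName(Bytes.toString(result.getValue(cf, Bytes.toBytes("lastName"))));
    s.setBranch(Bytes.toString(result.getValue(cf, Bytes.toBytes("branch"))));
    s.setEmailId(Bytes.toString(result.getValue(cf, Bytes.toBytes("emailId"))));
    return s;
});
```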

Now studentRDD holds all the records from the table as a Spark RDD of Student objects, so we can apply any aggregation or other Spark operation on top of it.
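For instance, a couple of illustrative operations on studentRDD (the branch value "CSE" is hypothetical):

```java
import org.apache.spark.api.java.JavaRDD;

// Count all rows read from the table.
long total = studentRDD.count();

// Keep only students of one branch and project their ids.
JavaRDD<String> cseIds = studentRDD
        .filter(s -> "CSE".equals(s.getBranch()))
        .map(Student::getSid);
```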
