Skip to content

Commit 3fd63a3

Browse files
committed
Added comments
1 parent 383f86f commit 3fd63a3

File tree

7 files changed

+139
-84
lines changed

7 files changed

+139
-84
lines changed

src/it/scala/com/qubole/spark/hiveacid/ReadSuite.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ import org.scalatest._
2626

2727
import scala.util.control.NonFatal
2828

29-
@Ignore
29+
//@Ignore
3030
class ReadSuite extends FunSuite with BeforeAndAfterEach with BeforeAndAfterAll {
3131

3232
val log: Logger = LogManager.getLogger(this.getClass)

src/main/scala/com/qubole/spark/hiveacid/HiveAcidDataSourceV2Reader.scala

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ class HiveAcidDataSourceV2Reader
6060
if (dbName != null) {
6161
hiveAcidMetadata = HiveAcidMetadata.fromSparkSession(sparkSession, dbName + "." + tblName)
6262
} else {
63+
// If the db name is null, the default db is chosen.
6364
hiveAcidMetadata = HiveAcidMetadata.fromSparkSession(sparkSession, tblName)
6465
}
6566

src/main/scala/com/qubole/spark/hiveacid/reader/hive/HiveAcidReader.scala

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -118,19 +118,20 @@ extends CastSupport with Reader with Logging {
118118
hiveAcidMetadata.hTable.getParameters,
119119
colNames, colTypes) _
120120

121-
//TODO : Need to cahce it with some unique id.
121+
//TODO : Need to cache it with some unique id.
122122
val jobConf = new JobConf(_broadcastedHadoopConf.value.value)
123123
initializeJobConfFunc(jobConf)
124124

125+
//TODO: Can this be done in parallel using multiple threads?
125126
val partitionArray = new java.util.ArrayList[InputPartition[ColumnarBatch]]
126127
val inputSplits = HiveAcidCommon.getInputSplits(jobConf, validWriteIds, 0,
127128
hiveAcidMetadata.isFullAcidTable, ifc)
128-
val reqFileds = hiveAcidMetadata.tableSchema.fields.filter(field =>
129+
val reqFields = hiveAcidMetadata.tableSchema.fields.filter(field =>
129130
readerOptions.requiredNonPartitionedColumns.contains(field.name))
130-
for (i <- 0 until inputSplits.size) {
131+
for (i <- inputSplits.indices) {
131132
partitionArray.add(new HiveAcidInputPartitionV2(inputSplits(i).asInstanceOf[HiveAcidPartition],
132133
sparkSession.sparkContext.broadcast(new SerializableConfiguration(jobConf)),
133-
partitionValues, reqFileds, hiveAcidMetadata.partitionSchema, hiveAcidMetadata.isFullAcidTable))
134+
partitionValues, reqFields, hiveAcidMetadata.partitionSchema, hiveAcidMetadata.isFullAcidTable))
134135
logInfo("getPartitions : Input split: " + inputSplits(i))
135136
}
136137
partitionArray
@@ -215,14 +216,12 @@ extends CastSupport with Reader with Logging {
215216

216217
val mutableRow = new SpecificInternalRow(hiveAcidMetadata.partitionSchema)
217218

218-
// Splits all attributes into two groups, partition key attributes and those
219-
// that are not. Attached indices indicate the position of each attribute in
220-
// the output schema.
221-
val (partitionKeyAttrs, nonPartitionKeyAttrs) =
222-
readerOptions.requiredAttributes.zipWithIndex.partition { case (attr, _) =>
219+
val partitionKeyAttrs =
220+
readerOptions.requiredAttributes.zipWithIndex.filter { attr =>
223221
readerOptions.partitionAttributes.contains(attr)
224222
}
225223

224+
//TODO : The partition values can be filled directly using hive acid batch reader.
226225
def fillPartitionKeys(rawPartValues: Array[String], row: InternalRow): Unit = {
227226
var offset = 0
228227
partitionKeyAttrs.foreach { case (attr, ordinal) =>

src/main/scala/com/qubole/spark/hiveacid/reader/v2/OrcColumnVector.java renamed to src/main/scala/com/qubole/spark/hiveacid/reader/v2/HiveAcidColumnVector.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -36,9 +36,9 @@
3636
/**
3737
* A column vector class wrapping Hive's ColumnVector. Because Spark ColumnarBatch only accepts
3838
* Spark's vectorized.ColumnVector, this column vector is used to adapt Hive ColumnVector with
39-
* Spark ColumnarVector.
39+
* Spark ColumnarVector. This class is a copy of spark ColumnVector which is declared private.
4040
*/
41-
public class OrcColumnVector extends org.apache.spark.sql.vectorized.ColumnVector {
41+
public class HiveAcidColumnVector extends org.apache.spark.sql.vectorized.ColumnVector {
4242
private ColumnVector baseData;
4343
private LongColumnVector longData;
4444
private DoubleColumnVector doubleData;
@@ -49,7 +49,7 @@ public class OrcColumnVector extends org.apache.spark.sql.vectorized.ColumnVecto
4949

5050
private int batchSize;
5151

52-
OrcColumnVector(DataType type, ColumnVector vector) {
52+
HiveAcidColumnVector(DataType type, ColumnVector vector) {
5353
super(type);
5454

5555
if (type instanceof TimestampType) {

src/main/scala/com/qubole/spark/hiveacid/reader/v2/HiveAcidInputPartitionReaderV2.scala

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -35,9 +35,8 @@ private[v2] class HiveAcidInputPartitionReaderV2(split: HiveAcidPartition,
3535
isFullAcidTable: Boolean)
3636
extends InputPartitionReader[ColumnarBatch] {
3737
//TODO : Need to get a unique id to cache the jobConf.
38-
private val jobConf : JobConf = new JobConf(broadcastedConf.value.value)
39-
private val sparkOrcColReader : OrcColumnarBatchReader =
40-
new OrcColumnarBatchReader(1024)
38+
private val jobConf = new JobConf(broadcastedConf.value.value)
39+
private val orcColumnarBatchReader = new OrcColumnarBatchReader(1024)
4140

4241
private def initReader() : Unit = {
4342
// Get the reader schema using the column names and types set in hive conf.
@@ -63,26 +62,27 @@ private[v2] class HiveAcidInputPartitionReaderV2(split: HiveAcidPartition,
6362
// Register the listener for closing the reader before init is done.
6463
val attemptId = new TaskAttemptID(new TaskID(new JobID(), TaskType.MAP, 0), 0)
6564
val taskAttemptContext = new org.apache.hadoop.mapred.TaskAttemptContextImpl(jobConf, attemptId)
66-
val iter = new org.apache.spark.sql.execution.datasources.RecordReaderIterator(sparkOrcColReader)
65+
val iter = new org.apache.spark.sql.execution.datasources.RecordReaderIterator(orcColumnarBatchReader)
6766
Option(TaskContext.get()).foreach(_.addTaskCompletionListener[Unit](_ => iter.close()))
6867

69-
sparkOrcColReader.initialize(fileSplit, taskAttemptContext)
70-
sparkOrcColReader.initBatch(readerLocal.getSchema, requestedColIds,
68+
//TODO: Need to generalize this to support other kinds of file formats.
69+
orcColumnarBatchReader.initialize(fileSplit, taskAttemptContext)
70+
orcColumnarBatchReader.initBatch(readerLocal.getSchema, requestedColIds,
7171
requiredFields, partitionSchema, partitionValues, isFullAcidTable && !fileSplit.isOriginal)
7272
}
7373
initReader()
7474

7575
@throws(classOf[IOException])
7676
override def next() : Boolean = {
77-
sparkOrcColReader.nextKeyValue()
77+
orcColumnarBatchReader.nextKeyValue()
7878
}
7979

8080
override def get () : ColumnarBatch = {
81-
sparkOrcColReader.getCurrentValue
81+
orcColumnarBatchReader.getCurrentValue
8282
}
8383

8484
@throws(classOf[IOException])
8585
override def close() : Unit = {
86-
sparkOrcColReader.close()
86+
orcColumnarBatchReader.close()
8787
}
8888
}

0 commit comments

Comments
 (0)