#797 Promote field extraction method from Copybook class to the companion object.

yruslan · yruslan · commit e7bece4274fb · 2026-03-10T08:59:42.000+01:00
diff --git a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/Copybook.scala b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/Copybook.scala
@@ -90,7 +90,7 @@ class Copybook(val ast: CopybookAST) extends Logging with Serializable {
   def getFieldValueByName(fieldName: String, recordBytes: Array[Byte], startOffset: Int = 0): Any = {
     val primitive = getPrimitiveFieldByName(fieldName)
 
-    extractPrimitiveField(primitive, recordBytes, startOffset)
+    getPrimitiveField(primitive, recordBytes, startOffset)
   }
 
   /**
@@ -203,23 +203,6 @@ class Copybook(val ast: CopybookAST) extends Logging with Serializable {
     }
   }
 
-  /**
-    * Get value of a field of the copybook record by the AST object of the field
-    *
-    * Nested field names can contain '.' to identify the exact field.
-    * If the field name is unique '.' is not required.
-    *
-    * @param field The AST object of the field
-    * @param bytes Binary encoded data of the record
-    * @param startOffset An offset to the beginning of the field in the data (in bytes).
-    * @return The value of the field
-    *
-    */
-  def extractPrimitiveField(field: Primitive, bytes: Array[Byte], startOffset: Int = 0): Any = {
-    val slicedBytes = bytes.slice(field.binaryProperties.offset + startOffset, field.binaryProperties.offset + startOffset + field.binaryProperties.actualSize)
-    field.decodeTypeValue(0, slicedBytes)
-  }
-
   /** This routine is used for testing by generating a layout position information to compare with mainframe output */
   def generateRecordLayoutPositions(): String = {
     var fieldCounter: Int = 0
@@ -431,6 +414,28 @@ object Copybook {
     new Copybook(schema)
   }
 
+  /**
+    * Get value of a field of the copybook record by the AST object of the field
+    *
+    * The field is read from `bytes` starting at `field.binaryProperties.offset + startOffset` and spanning
+    * `field.binaryProperties.actualSize` bytes. That slice is then passed to `field.decodeTypeValue`.
+    *
+    * @param field The AST object of the field
+    * @param bytes Binary encoded data of the record
+    * @param startOffset An offset (in bytes) that is added to the field's own offset when locating it in the data.
+    * @return The value of the field, as returned by `field.decodeTypeValue`
+    *
+    */
+  def getPrimitiveField(field: Primitive, bytes: Array[Byte], startOffset: Int = 0): Any = {
+    val slicedBytes = bytes.slice(field.binaryProperties.offset + startOffset, field.binaryProperties.offset + startOffset + field.binaryProperties.actualSize)
+    field.decodeTypeValue(0, slicedBytes)
+  }
+
+  /** Same as getPrimitiveField(), kept under the old name. It is defined on the companion object, so call it as Copybook.extractPrimitiveField(...). */
+  def extractPrimitiveField(field: Primitive, bytes: Array[Byte], startOffset: Int = 0): Any = {
+    getPrimitiveField(field, bytes, startOffset)
+  }
+
   /**
     * Set value of a field of the copybook record by the AST object of the field
     *
diff --git a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/extractors/raw/FixedWithRecordLengthExprRawRecordExtractor.scala b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/extractors/raw/FixedWithRecordLengthExprRawRecordExtractor.scala
@@ -17,6 +17,7 @@
 package za.co.absa.cobrix.cobol.reader.extractors.raw
 
 import org.slf4j.LoggerFactory
+import za.co.absa.cobrix.cobol.parser.Copybook
 import za.co.absa.cobrix.cobol.parser.ast.Primitive
 import za.co.absa.cobrix.cobol.reader.iterator.RecordLengthExpression
 import za.co.absa.cobrix.cobol.reader.parameters.ReaderParameters
@@ -123,7 +124,7 @@ class FixedWithRecordLengthExprRawRecordExtractor(ctx: RawRecordContext,
 
   final private def getRecordLengthFromField(lengthAST: Primitive, binaryDataStart: Array[Byte]): Int = {
     val length = if (isLengthMapEmpty) {
-      ctx.copybook.extractPrimitiveField(lengthAST, binaryDataStart, readerProperties.startOffset) match {
+      Copybook.getPrimitiveField(lengthAST, binaryDataStart, readerProperties.startOffset) match {
         case i: Int        => i
         case l: Long       => l.toInt
         case s: String     => Try{ s.toInt }.getOrElse(throw new IllegalStateException(s"Record length value of the field ${lengthAST.name} must be an integral type, encountered: '$s'."))
@@ -132,7 +133,7 @@ class FixedWithRecordLengthExprRawRecordExtractor(ctx: RawRecordContext,
         case _             => throw new IllegalStateException(s"Record length value of the field ${lengthAST.name} must be an integral type.")
       }
     } else {
-      ctx.copybook.extractPrimitiveField(lengthAST, binaryDataStart, readerProperties.startOffset) match {
+      Copybook.getPrimitiveField(lengthAST, binaryDataStart, readerProperties.startOffset) match {
         case i: Int        => getRecordLengthFromMapping(i.toString)
         case l: Long       => getRecordLengthFromMapping(l.toString)
         case d: BigDecimal => getRecordLengthFromMapping(d.toString())
@@ -165,7 +166,7 @@ class FixedWithRecordLengthExprRawRecordExtractor(ctx: RawRecordContext,
 
     expr.fields.foreach{
       case (name, field) =>
-        val obj = ctx.copybook.extractPrimitiveField(field, binaryDataStart, readerProperties.startOffset)
+        val obj = Copybook.getPrimitiveField(field, binaryDataStart, readerProperties.startOffset)
         try {
           obj match {
             case i: Int    => evaluator.setValue(name, i)
@@ -194,7 +195,7 @@ class FixedWithRecordLengthExprRawRecordExtractor(ctx: RawRecordContext,
 
   private def getSegmentId(data: Array[Byte]): Option[String] = {
     segmentIdField.map(field => {
-      val fieldValue = ctx.copybook.extractPrimitiveField(field, data, readerProperties.startOffset)
+      val fieldValue = Copybook.getPrimitiveField(field, data, readerProperties.startOffset)
       if (fieldValue == null) {
         log.error(s"An unexpected null encountered for segment id at $byteIndex")
         ""
diff --git a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/index/IndexGenerator.scala b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/index/IndexGenerator.scala
@@ -99,14 +99,14 @@ object IndexGenerator extends Logging {
       } else {
         if (isValid) {
           if (isReallyHierarchical && rootRecordId.isEmpty) {
-            val curSegmentId = getSegmentId(copybook.get, segmentField.get, record)
+            val curSegmentId = getSegmentId(segmentField.get, record)
             if ((curSegmentId.nonEmpty && rootSegmentIds.isEmpty)
               || (rootSegmentIds.nonEmpty && rootSegmentIds.contains(curSegmentId))) {
               rootRecordId = curSegmentId
             }
           }
           if (canSplit && needSplit(recordsInChunk, bytesInChunk)) {
-            if (!isReallyHierarchical || isSegmentGoodForSplit(rootSegmentIds, copybook.get, segmentField.get, record)) {
+            if (!isReallyHierarchical || isSegmentGoodForSplit(rootSegmentIds, segmentField.get, record)) {
               val indexEntry = SparseIndexEntry(byteIndex, -1, fileId, recordIndex)
               val len = index.length
               // Do not add an entry if we are still at the same position as the previous entry.
@@ -157,15 +157,14 @@ object IndexGenerator extends Logging {
   }
 
   private def isSegmentGoodForSplit(rootSegmentIds: List[String],
-                                    copybook: Copybook,
                                     segmentField: Primitive,
                                     record: Array[Byte]): Boolean = {
-    val segmentId = getSegmentId(copybook, segmentField, record)
+    val segmentId = getSegmentId(segmentField, record)
     rootSegmentIds.contains(segmentId)
   }
 
-  private def getSegmentId(copybook: Copybook, segmentIdField: Primitive, data: Array[Byte]): String = {
-    val v = copybook.extractPrimitiveField(segmentIdField, data)
+  private def getSegmentId(segmentIdField: Primitive, data: Array[Byte]): String = {
+    val v = Copybook.getPrimitiveField(segmentIdField, data)
     if (v == null) "" else v.toString.trim
   }
 }
diff --git a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/iterator/FixedLenNestedRowIterator.scala b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/iterator/FixedLenNestedRowIterator.scala
@@ -17,6 +17,7 @@
 package za.co.absa.cobrix.cobol.reader.iterator
 
 import za.co.absa.cobrix.cobol.internal.Logging
+import za.co.absa.cobrix.cobol.parser.Copybook
 import za.co.absa.cobrix.cobol.reader.extractors.record.{RecordExtractors, RecordHandler}
 import za.co.absa.cobrix.cobol.reader.parameters.{CorruptFieldsPolicy, ReaderParameters}
 import za.co.absa.cobrix.cobol.reader.schema.CobolSchema
@@ -108,7 +109,7 @@ class FixedLenNestedRowIterator[T: ClassTag](
 
   private def getSegmentId(data: Array[Byte], offset: Int): Option[String] = {
     segmentIdField.map(field => {
-      val fieldValue = cobolSchema.copybook.extractPrimitiveField(field, data, offset)
+      val fieldValue = Copybook.getPrimitiveField(field, data, offset)
       if (fieldValue == null) {
         logger.error(s"An unexpected null encountered for segment id at $byteIndex")
         ""
diff --git a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/iterator/VRLRecordReader.scala b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/iterator/VRLRecordReader.scala
@@ -18,10 +18,9 @@ package za.co.absa.cobrix.cobol.reader.iterator
 
 import za.co.absa.cobrix.cobol.internal.Logging
 import za.co.absa.cobrix.cobol.parser.Copybook
-import za.co.absa.cobrix.cobol.parser.ast.Primitive
 import za.co.absa.cobrix.cobol.parser.headerparsers.RecordHeaderParser
-import za.co.absa.cobrix.cobol.reader.parameters.ReaderParameters
 import za.co.absa.cobrix.cobol.reader.extractors.raw.RawRecordExtractor
+import za.co.absa.cobrix.cobol.reader.parameters.ReaderParameters
 import za.co.absa.cobrix.cobol.reader.stream.SimpleStream
 import za.co.absa.cobrix.cobol.reader.validator.ReaderParametersValidator
 
@@ -143,7 +142,7 @@ class VRLRecordReader(cobolSchema: Copybook,
 
   private def getSegmentId(data: Array[Byte]): Option[String] = {
     segmentIdField.map(field => {
-      val fieldValue = cobolSchema.extractPrimitiveField(field, data, readerProperties.startOffset)
+      val fieldValue = Copybook.getPrimitiveField(field, data, readerProperties.startOffset)
       if (fieldValue == null) {
         logger.error(s"An unexpected null encountered for segment id at $byteIndex")
         ""
diff --git a/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/extract/BinaryExtractorSpec.scala b/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/extract/BinaryExtractorSpec.scala
@@ -17,12 +17,12 @@
 package za.co.absa.cobrix.cobol.parser.extract
 
 import org.scalatest.funsuite.AnyFunSuite
-import za.co.absa.cobrix.cobol.parser.CopybookParser
 import za.co.absa.cobrix.cobol.parser.ast.datatype.{AlphaNumeric, CobolType}
 import za.co.absa.cobrix.cobol.parser.ast.{BinaryProperties, Group, Primitive}
 import za.co.absa.cobrix.cobol.parser.decoders.DecoderSelector
 import za.co.absa.cobrix.cobol.parser.encoding.{EBCDIC, EncoderSelector}
 import za.co.absa.cobrix.cobol.parser.policies.StringTrimmingPolicy
+import za.co.absa.cobrix.cobol.parser.{Copybook, CopybookParser}
 
 class BinaryExtractorSpec extends AnyFunSuite {
 
@@ -113,15 +113,15 @@ class BinaryExtractorSpec extends AnyFunSuite {
     0x00.toByte, 0x00.toByte, 0x2F.toByte
   )
 
-  val copybook = CopybookParser.parseTree(copyBookContents)
+  val copybook: Copybook = CopybookParser.parseTree(copyBookContents)
   val startOffset: Int = 0
 
   test("Test extract primitive field") {
 
     // using getFieldByName
     val statement = copybook.getFieldByName("ID")
     val field: Primitive = statement.asInstanceOf[Primitive]
-    val result: Any = copybook.extractPrimitiveField(field, bytes, startOffset)
+    val result: Any = Copybook.getPrimitiveField(field, bytes, startOffset)
     assert(result.asInstanceOf[Int] === 6)
 
     // traverse AST and extract all primitives to map
@@ -130,7 +130,7 @@ class BinaryExtractorSpec extends AnyFunSuite {
     def traverseAst(group: Group): Unit = {
       for (child <- group.children) {
         if (child.isInstanceOf[Primitive]) {
-          extractedData += (child.name -> copybook.extractPrimitiveField(child.asInstanceOf[Primitive],
+          extractedData += (child.name -> Copybook.extractPrimitiveField(child.asInstanceOf[Primitive],
             bytes, startOffset))
         } else {
           assert(child.isInstanceOf[Group] === true)
@@ -162,7 +162,7 @@ class BinaryExtractorSpec extends AnyFunSuite {
 
     val primitive: Primitive = Primitive(level, name, name, lineNumber, dataType, redefines, isRedefined,
       occurs, to, dependingOn, Map(), isDependee, isFiller, DecoderSelector.getDecoder(dataType), EncoderSelector.getEncoder(dataType), binaryProperties)(None)
-    val result2: Any = copybook.extractPrimitiveField(primitive, bytes, startOffset)
+    val result2: Any = Copybook.extractPrimitiveField(primitive, bytes, startOffset)
     assert(result2.asInstanceOf[String] === "EXAMPLE4")
   }
 
diff --git a/spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/writer/NestedRecordCombiner.scala b/spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/writer/NestedRecordCombiner.scala
@@ -278,7 +278,7 @@ object NestedRecordCombiner {
         }
       }
     }.getOrElse {
-      // Dependee fields need not to be defines in Spark schema.
+      // Dependee fields need not be defined in Spark schema.
       if (p.isDependee) {
         PrimitiveDependeeField(addDependee())
       } else {
@@ -369,6 +369,9 @@ object NestedRecordCombiner {
 
       // ── Primitive which has an OCCURS DEPENDS ON ─────────────────────────────
       case PrimitiveDependeeField(spec) =>
+        // NOTE: baseOffset is mutated here for each row. This is safe because rows
+        // are processed sequentially within mapPartitions, and the offset is always
+        // updated before being read in subsequent array-element writes.
         spec.baseOffset = currentOffset
         spec.cobolField.binaryProperties.actualSize
 

Original file line number	Diff line number	Diff line change
`@@ -99,14 +99,14 @@ object IndexGenerator extends Logging {`
`99`	`99`	`} else {`
`100`	`100`	`if (isValid) {`
`101`	`101`	`if (isReallyHierarchical && rootRecordId.isEmpty) {`
`102`		`- val curSegmentId = getSegmentId(copybook.get, segmentField.get, record)`
	`102`	`+ val curSegmentId = getSegmentId(segmentField.get, record)`
`103`	`103`	`if ((curSegmentId.nonEmpty && rootSegmentIds.isEmpty)`
`104`	`104`	`\|\| (rootSegmentIds.nonEmpty && rootSegmentIds.contains(curSegmentId))) {`
`105`	`105`	`rootRecordId = curSegmentId`
`106`	`106`	`}`
`107`	`107`	`}`
`108`	`108`	`if (canSplit && needSplit(recordsInChunk, bytesInChunk)) {`
`109`		`- if (!isReallyHierarchical \|\| isSegmentGoodForSplit(rootSegmentIds, copybook.get, segmentField.get, record)) {`
	`109`	`+ if (!isReallyHierarchical \|\| isSegmentGoodForSplit(rootSegmentIds, segmentField.get, record)) {`
`110`	`110`	`val indexEntry = SparseIndexEntry(byteIndex, -1, fileId, recordIndex)`
`111`	`111`	`val len = index.length`
`112`	`112`	`// Do not add an entry if we are still at the same position as the previous entry.`
`@@ -157,15 +157,14 @@ object IndexGenerator extends Logging {`
`157`	`157`	`}`
`158`	`158`
`159`	`159`	`private def isSegmentGoodForSplit(rootSegmentIds: List[String],`
`160`		`- copybook: Copybook,`
`161`	`160`	`segmentField: Primitive,`
`162`	`161`	`record: Array[Byte]): Boolean = {`
`163`		`- val segmentId = getSegmentId(copybook, segmentField, record)`
	`162`	`+ val segmentId = getSegmentId(segmentField, record)`
`164`	`163`	`rootSegmentIds.contains(segmentId)`
`165`	`164`	`}`
`166`	`165`
`167`		`- private def getSegmentId(copybook: Copybook, segmentIdField: Primitive, data: Array[Byte]): String = {`
`168`		`- val v = copybook.extractPrimitiveField(segmentIdField, data)`
	`166`	`+ private def getSegmentId(segmentIdField: Primitive, data: Array[Byte]): String = {`
	`167`	`+ val v = Copybook.getPrimitiveField(segmentIdField, data)`
`169`	`168`	`if (v == null) "" else v.toString.trim`
`170`	`169`	`}`
`171`	`170`	`}`