AbsaOSS · yruslan · Feb 24, 2026 · Feb 24, 2026 · Feb 24, 2026 · Feb 24, 2026
@@ -20,10 +20,10 @@ import za.co.absa.cobrix.cobol.parser.CopybookParser.CopybookAST
 import za.co.absa.cobrix.cobol.parser.ast.datatype.AlphaNumeric
 import za.co.absa.cobrix.cobol.parser.ast.{Group, Primitive, Statement}
 import za.co.absa.cobrix.cobol.parser.decoders.StringDecoders
-import za.co.absa.cobrix.cobol.parser.decoders.StringDecoders.KeepAll
 import za.co.absa.cobrix.cobol.parser.encoding._
 import za.co.absa.cobrix.cobol.parser.policies.DebugFieldsPolicy
 import za.co.absa.cobrix.cobol.parser.policies.DebugFieldsPolicy.DebugFieldsPolicy
+import za.co.absa.cobrix.cobol.utils.StringUtils
 
 import scala.collection.mutable.ArrayBuffer
 
@@ -47,7 +47,7 @@ class DebugFieldsAdder(debugFieldsPolicy: DebugFieldsPolicy) extends AstTransfor
       }
 
       val debugDecoder = debugFieldsPolicy match {
-        case DebugFieldsPolicy.HexValue => StringDecoders.decodeHex _
+        case DebugFieldsPolicy.HexValue => StringUtils.convertArrayToHex _
         case DebugFieldsPolicy.RawValue => StringDecoders.decodeRaw _
         case DebugFieldsPolicy.StringValue => (a: Array[Byte]) => new String(a)
         case _ => throw new IllegalStateException(s"Unexpected debug fields policy: $debugFieldsPolicy.")

@@ -23,6 +23,7 @@ import za.co.absa.cobrix.cobol.parser.decoders.FloatingPointFormat.FloatingPoint
 import za.co.absa.cobrix.cobol.parser.encoding._
 import za.co.absa.cobrix.cobol.parser.encoding.codepage.{CodePage, CodePageCommon}
 import za.co.absa.cobrix.cobol.parser.position.Position
+import za.co.absa.cobrix.cobol.utils.StringUtils
 
 import java.nio.charset.{Charset, StandardCharsets}
 import scala.util.control.NonFatal
@@ -94,7 +95,7 @@ object DecoderSelector {
       case UTF16 =>
         StringDecoders.decodeUtf16String(_, getStringStrimmingType(stringTrimmingPolicy), isUtf16BigEndian, improvedNullDetection)
       case HEX =>
-        StringDecoders.decodeHex
+        StringUtils.convertArrayToHex
       case RAW =>
         StringDecoders.decodeRaw
     }

@@ -33,9 +33,6 @@ object StringDecoders {
   val TrimBoth  = 4
   val KeepAll   = 5
 
-  // Characters used for HEX conversion
-  private val HEX_ARRAY = "0123456789ABCDEF".toCharArray
-
   /**
     * A decoder for any EBCDIC string fields (alphabetical or any char)
     *
@@ -125,24 +122,6 @@ object StringDecoders {
     }
   }
 
-  /**
-    * A decoder for representing bytes as hex strings
-    *
-    * @param bytes A byte array that represents the binary data
-    * @return A HEX string representation of the binary data
-    */
-  final def decodeHex(bytes: Array[Byte]): String = {
-    val hexChars = new Array[Char](bytes.length * 2)
-    var i = 0
-    while (i < bytes.length) {
-      val v = bytes(i) & 0xFF
-      hexChars(i * 2) = HEX_ARRAY(v >>> 4)
-      hexChars(i * 2 + 1) = HEX_ARRAY(v & 0x0F)
-      i += 1
-    }
-    new String(hexChars)
-  }
-
   /**
     * A decoder that doesn't decode, but just passes the bytes the way they are.
     *

@@ -21,6 +21,7 @@ import za.co.absa.cobrix.cobol.parser.ast.Primitive
 import za.co.absa.cobrix.cobol.reader.iterator.RecordLengthExpression
 import za.co.absa.cobrix.cobol.reader.parameters.ReaderParameters
 import za.co.absa.cobrix.cobol.reader.validator.ReaderParametersValidator
+import za.co.absa.cobrix.cobol.utils.StringUtils
 
 import scala.util.Try
 
@@ -127,7 +128,7 @@ class FixedWithRecordLengthExprRawRecordExtractor(ctx: RawRecordContext,
         case l: Long       => l.toInt
         case s: String     => Try{ s.toInt }.getOrElse(throw new IllegalStateException(s"Record length value of the field ${lengthAST.name} must be an integral type, encountered: '$s'."))
         case d: BigDecimal => d.toInt
-        case null          => throw new IllegalStateException(s"Null encountered as a record length field (offset: $byteIndex, raw value: ${getBytesAsHexString(binaryDataStart)}).")
+        case null          => throw new IllegalStateException(s"Null encountered as a record length field (offset: $byteIndex, raw value: ${StringUtils.convertArrayToHex(binaryDataStart)}).")
         case _             => throw new IllegalStateException(s"Record length value of the field ${lengthAST.name} must be an integral type.")
       }
     } else {
@@ -136,7 +137,7 @@ class FixedWithRecordLengthExprRawRecordExtractor(ctx: RawRecordContext,
         case l: Long       => getRecordLengthFromMapping(l.toString)
         case d: BigDecimal => getRecordLengthFromMapping(d.toString())
         case s: String     => getRecordLengthFromMapping(s)
-        case null          => defaultRecordLength.getOrElse(throw new IllegalStateException(s"Null encountered as a record length field (offset: $byteIndex, raw value: ${getBytesAsHexString(binaryDataStart)})."))
+        case null          => defaultRecordLength.getOrElse(throw new IllegalStateException(s"Null encountered as a record length field (offset: $byteIndex, raw value: ${StringUtils.convertArrayToHex(binaryDataStart)})."))
         case _             => throw new IllegalStateException(s"Record length value of the field ${lengthAST.name} must be an integral type.")
       }
     }
@@ -150,10 +151,6 @@ class FixedWithRecordLengthExprRawRecordExtractor(ctx: RawRecordContext,
     }
   }
 
-  final private def getBytesAsHexString(bytes: Array[Byte]): String = {
-    bytes.map("%02X" format _).mkString
-  }
-
   private def fetchRecordUsingRecordLengthFieldExpression(expr: RecordLengthExpression): Option[Array[Byte]] = {
     val lengthFieldBlock = expr.requiredBytesToread
     val evaluator = expr.evaluator

@@ -23,6 +23,7 @@ import za.co.absa.cobrix.cobol.parser.common.Constants
 import za.co.absa.cobrix.cobol.parser.encoding.RAW
 import za.co.absa.cobrix.cobol.reader.policies.SchemaRetentionPolicy
 import za.co.absa.cobrix.cobol.reader.policies.SchemaRetentionPolicy.SchemaRetentionPolicy
+import za.co.absa.cobrix.cobol.utils.StringUtils
 
 import scala.collection.mutable
 import scala.collection.mutable.{ArrayBuffer, ListBuffer}
@@ -35,21 +36,22 @@ object RecordExtractors {
   /**
     * This method extracts a record from the specified array of bytes. The copybook for the record needs to be already parsed.
     *
-    * @param ast                     The parsed copybook.
-    * @param data                    The data bits containing the record.
-    * @param offsetBytes             The offset to the beginning of the record (in bits).
-    * @param policy                  A schema retention policy to be applied to the extracted record.
-    * @param variableLengthOccurs    If true, OCCURS DEPENDING ON data size will depend on the number of elements.
-    * @param generateRecordId        If true, a record id field will be added as the first field of the record.
-    * @param generateRecordBytes     If true, a record bytes field will be added at the beginning of each record.
-    * @param generateCorruptFields   If true, a corrupt fields field will be appended to the end of the schema.
-    * @param segmentLevelIds         Segment ids to put to the extracted record if id generation it turned on.
-    * @param fileId                  A file id to be put to the extractor record if generateRecordId == true.
-    * @param recordId                The record id to be saved to the record id field.
-    * @param activeSegmentRedefine   An active segment redefine (the one that will be parsed).
-    *                                All other segment redefines will be skipped.
-    * @param generateInputFileField  if true, a field containing input file name will be generated
-    * @param inputFileName           An input file name to put if its generation is needed
+    * @param ast                        The parsed copybook.
+    * @param data                       The data bits containing the record.
+    * @param offsetBytes                The offset to the beginning of the record (in bytes).
+    * @param policy                     A schema retention policy to be applied to the extracted record.
+    * @param variableLengthOccurs       If true, OCCURS DEPENDING ON data size will depend on the number of elements.
+    * @param generateRecordId           If true, a record id field will be added as the first field of the record.
+    * @param generateRecordBytes        If true, a record bytes field will be added at the beginning of each record.
+    * @param generateCorruptFields      If true, a corrupt fields field will be appended to the end of the schema.
+    * @param generateCorruptFieldsAsHex If true, corrupt fields will be generated as hex strings, otherwise they will be generated as binary data. This parameter is only relevant if generateCorruptFields is true.
+    * @param segmentLevelIds            Segment ids to put to the extracted record if id generation is turned on.
+    * @param fileId                     A file id to be put to the extractor record if generateRecordId == true.
+    * @param recordId                   The record id to be saved to the record id field.
+    * @param activeSegmentRedefine      An active segment redefine (the one that will be parsed).
+    *                                   All other segment redefines will be skipped.
+    * @param generateInputFileField     if true, a field containing input file name will be generated
+    * @param inputFileName              An input file name to put if its generation is needed
     * @return An Array[Any] object corresponding to the record schema.
     */
   @throws(classOf[IllegalStateException])
@@ -62,6 +64,7 @@ object RecordExtractors {
                                   generateRecordId: Boolean = false,
                                   generateRecordBytes: Boolean = false,
                                   generateCorruptFields: Boolean = false,
+                                  generateCorruptFieldsAsHex: Boolean = false,
                                   segmentLevelIds: List[String] = Nil,
                                   fileId: Int = 0,
                                   recordId: Long = 0,
@@ -213,7 +216,7 @@ object RecordExtractors {
       policy
     }
 
-    applyRecordPostProcessing(ast, records.toList, effectiveSchemaRetentionPolicy, generateRecordId, generateRecordBytes, generateCorruptFields, segmentLevelIds, fileId, recordId, data.length, data, generateInputFileField, inputFileName, corruptFields, handler)
+    applyRecordPostProcessing(ast, records.toList, effectiveSchemaRetentionPolicy, generateRecordId, generateRecordBytes, generateCorruptFields, generateCorruptFieldsAsHex, segmentLevelIds, fileId, recordId, data.length, data, generateInputFileField, inputFileName, corruptFields, handler)
   }
 
   /**
@@ -433,7 +436,7 @@ object RecordExtractors {
       policy
     }
 
-    applyRecordPostProcessing(ast, records.toList, effectiveSchemaRetentionPolicy, generateRecordId, generateRecordBytes = false, generateCorruptFields = false,  Nil, fileId, recordId, recordLength, Array.empty[Byte], generateInputFileField = generateInputFileField, inputFileName, null, handler)
+    applyRecordPostProcessing(ast, records.toList, effectiveSchemaRetentionPolicy, generateRecordId, generateRecordBytes = false, generateCorruptFields = false, generateCorruptFieldsAsHex = false, Nil, fileId, recordId, recordLength, Array.empty[Byte], generateInputFileField = generateInputFileField, inputFileName, null, handler)
   }
 
   /**
@@ -449,16 +452,18 @@ object RecordExtractors {
     * Combinations of the listed transformations are supported.
     * </p>
     *
-    * @param ast                     The parsed copybook
-    * @param records                 The array of [[T]] object for each Group of the copybook
-    * @param generateRecordId        If true a record id field will be added as the first field of the record.
-    * @param generateRecordBytes     If true a record bytes field will be added at the beginning of the record.
-    * @param generateCorruptFields   If true,a corrupt fields field will be appended to the end of the schema.
-    * @param fileId                  The file id to be saved to the file id field
-    * @param recordId                The record id to be saved to the record id field
-    * @param recordByteLength        The length of the record
-    * @param generateInputFileField  if true, a field containing input file name will be generated
-    * @param inputFileName           An input file name to put if its generation is needed
+    * @param ast                        The parsed copybook
+    * @param records                    The array of [[T]] object for each Group of the copybook
+    * @param generateRecordId           If true a record id field will be added as the first field of the record.
+    * @param generateRecordBytes        If true a record bytes field will be added at the beginning of the record.
+    * @param generateCorruptFields      If true, a corrupt fields field will be appended to the end of the schema.
+    * @param generateCorruptFieldsAsHex If true, corrupt fields will be generated as hex strings, otherwise they will be generated as binary data. This parameter is only relevant if generateCorruptFields is true.
+    * @param segmentLevelIds            Segment ids to put to the extracted record if id generation is turned on.
+    * @param fileId                     The file id to be saved to the file id field
+    * @param recordId                   The record id to be saved to the record id field
+    * @param recordByteLength           The length of the record
+    * @param generateInputFileField     if true, a field containing input file name will be generated
+    * @param inputFileName              An input file name to put if its generation is needed
     * @return A [[T]] object corresponding to the record schema
     */
   private def applyRecordPostProcessing[T](
@@ -468,6 +473,7 @@ object RecordExtractors {
                                             generateRecordId: Boolean,
                                             generateRecordBytes: Boolean,
                                             generateCorruptFields: Boolean,
+                                            generateCorruptFieldsAsHex: Boolean = false,
                                             segmentLevelIds: List[String],
                                             fileId: Int,
                                             recordId: Long,
@@ -515,7 +521,12 @@ object RecordExtractors {
       val ar = new Array[Any](len)
       var i = 0
       while (i < len) {
-        val r = handler.create(Array[Any](corruptFields(i).fieldName, corruptFields(i).rawValue), corruptFieldsGroup)
+        val r = if (generateCorruptFieldsAsHex) {
+          val hex = StringUtils.convertArrayToHex(corruptFields(i).rawValue)
+          handler.create(Array[Any](corruptFields(i).fieldName, hex), corruptFieldsGroup)
+        } else {
+          handler.create(Array[Any](corruptFields(i).fieldName, corruptFields(i).rawValue), corruptFieldsGroup)
+        }
         ar(i) = r
         i += 1
       }
@@ -525,7 +536,7 @@ object RecordExtractors {
     // toList() is a constant time operation, and List implements immutable Seq, which is exactly what is needed here.
     outputRecords.toList
   }
-
+  
   /**
     * Constructs a Group object representing corrupt fields. It is only needed for constructing records that require field names,
     * such as JSON. Field sizes and encoding do not really matter

@@ -18,7 +18,7 @@ package za.co.absa.cobrix.cobol.reader.iterator
 
 import za.co.absa.cobrix.cobol.internal.Logging
 import za.co.absa.cobrix.cobol.reader.extractors.record.{RecordExtractors, RecordHandler}
-import za.co.absa.cobrix.cobol.reader.parameters.ReaderParameters
+import za.co.absa.cobrix.cobol.reader.parameters.{CorruptFieldsPolicy, ReaderParameters}
 import za.co.absa.cobrix.cobol.reader.schema.CobolSchema
 import za.co.absa.cobrix.cobol.reader.validator.ReaderParametersValidator
 
@@ -47,6 +47,8 @@ class FixedLenNestedRowIterator[T: ClassTag](
   private val segmentIdFilter = readerProperties.multisegment.flatMap(_.segmentIdFilter)
   private val segmentRedefineMap = readerProperties.multisegment.map(_.segmentIdRedefineMap).getOrElse(HashMap[String, String]())
   private val segmentRedefineAvailable = segmentRedefineMap.nonEmpty
+  private val generateCorruptFields = readerProperties.corruptFieldsPolicy != CorruptFieldsPolicy.Disabled
+  private val generateCorruptFieldsAsHex = readerProperties.corruptFieldsPolicy == CorruptFieldsPolicy.Hex
 
   override def hasNext: Boolean = {
     val correctOffset = if (singleRecordOnly) {
@@ -90,7 +92,8 @@ class FixedLenNestedRowIterator[T: ClassTag](
       readerProperties.schemaPolicy,
       readerProperties.variableSizeOccurs,
       generateRecordBytes = readerProperties.generateRecordBytes,
-      generateCorruptFields = readerProperties.generateCorruptFields,
+      generateCorruptFields = generateCorruptFields,
+      generateCorruptFieldsAsHex = generateCorruptFieldsAsHex,
       activeSegmentRedefine = activeSegmentRedefine,
       handler = handler
     )

@@ -20,7 +20,7 @@ import za.co.absa.cobrix.cobol.parser.Copybook
 import za.co.absa.cobrix.cobol.parser.headerparsers.RecordHeaderParser
 import za.co.absa.cobrix.cobol.reader.extractors.raw.RawRecordExtractor
 import za.co.absa.cobrix.cobol.reader.extractors.record.{RecordExtractors, RecordHandler}
-import za.co.absa.cobrix.cobol.reader.parameters.ReaderParameters
+import za.co.absa.cobrix.cobol.reader.parameters.{CorruptFieldsPolicy, ReaderParameters}
 import za.co.absa.cobrix.cobol.reader.stream.SimpleStream
 
 import scala.collection.immutable.HashMap
@@ -60,6 +60,8 @@ final class VarLenNestedIterator[T: ClassTag](cobolSchema: Copybook,
   private val segmentRedefineMap = readerProperties.multisegment.map(_.segmentIdRedefineMap).getOrElse(HashMap[String, String]())
   private val segmentRedefineAvailable = segmentRedefineMap.nonEmpty
   private val generateInputFileName = readerProperties.inputFileNameColumn.nonEmpty
+  private val generateCorruptFields = readerProperties.corruptFieldsPolicy != CorruptFieldsPolicy.Disabled
+  private val generateCorruptFieldsAsHex = readerProperties.corruptFieldsPolicy == CorruptFieldsPolicy.Hex
 
   fetchNext()
 
@@ -99,7 +101,8 @@ final class VarLenNestedIterator[T: ClassTag](cobolSchema: Copybook,
                 readerProperties.variableSizeOccurs,
                 readerProperties.generateRecordId,
                 readerProperties.generateRecordBytes,
-                readerProperties.generateCorruptFields,
+                generateCorruptFields,
+                generateCorruptFieldsAsHex,
                 segmentLevelIds,
                 fileId,
                 rawRecordIterator.getRecordIndex,

@@ -400,6 +400,16 @@ object CobolParametersParser extends Logging {
     else
       None
 
+    val corruptFieldsPolicy = if (parameters.generateCorruptFields) {
+      if (parameters.decodeBinaryAsHex) {
+        CorruptFieldsPolicy.Hex
+      } else {
+        CorruptFieldsPolicy.Binary
+      }
+    } else {
+      CorruptFieldsPolicy.Disabled
+    }
+
     ReaderParameters(
       recordFormat = parameters.recordFormat,
       isEbcdic = parameters.isEbcdic,
@@ -433,7 +443,7 @@ object CobolParametersParser extends Logging {
       fileEndOffset = varLenParams.fileEndOffset,
       generateRecordId = varLenParams.generateRecordId,
       generateRecordBytes = parameters.generateRecordBytes,
-      generateCorruptFields = parameters.generateCorruptFields,
+      corruptFieldsPolicy = corruptFieldsPolicy,
       schemaPolicy = parameters.schemaRetentionPolicy,
       stringTrimmingPolicy = parameters.stringTrimmingPolicy,
       isDisplayAlwaysString = parameters.isDisplayAlwaysString,