@@ -722,6 +722,45 @@ parse_add_char(ReaderObj *self, _csvstate *module_state, Py_UCS4 c)
722722 return 0 ;
723723}
724724
725+ static int
726+ parse_add_substring (ReaderObj * self , _csvstate * module_state ,
727+ PyObject * lineobj , Py_ssize_t start , Py_ssize_t end )
728+ {
729+ int kind ;
730+ const void * data ;
731+ Py_UCS4 * dest ;
732+ Py_ssize_t field_limit ;
733+
734+ Py_ssize_t len = end - start ;
735+ if (len <= 0 ) {
736+ return 0 ;
737+ }
738+
739+ field_limit = FT_ATOMIC_LOAD_SSIZE_RELAXED (module_state -> field_limit );
740+ if (self -> field_len + len > field_limit ) {
741+ PyErr_Format (module_state -> error_obj ,
742+ "field larger than field limit (%zd)" ,
743+ field_limit );
744+ return -1 ;
745+ }
746+
747+ while (self -> field_len + len > self -> field_size ) {
748+ if (!parse_grow_buff (self ))
749+ return -1 ;
750+ }
751+
752+ kind = PyUnicode_KIND (lineobj );
753+ data = PyUnicode_DATA (lineobj );
754+ dest = self -> field + self -> field_len ;
755+
756+ for (Py_ssize_t i = 0 ; i < len ; ++ i ) {
757+ dest [i ] = PyUnicode_READ (kind , data , start + i );
758+ }
759+
760+ self -> field_len += len ;
761+ return 0 ;
762+ }
763+
725764static int
726765parse_process_char (ReaderObj * self , _csvstate * module_state , Py_UCS4 c )
727766{
@@ -923,11 +962,9 @@ Reader_iternext(PyObject *op)
923962 ReaderObj * self = _ReaderObj_CAST (op );
924963
925964 PyObject * fields = NULL ;
926- Py_UCS4 c ;
927- Py_ssize_t pos , linelen ;
928- int kind ;
929- const void * data ;
965+ Py_ssize_t pos , linelen , chunk_end , p ;
930966 PyObject * lineobj ;
967+ DialectObj * dialect ;
931968
932969 _csvstate * module_state = _csv_state_from_type (Py_TYPE (self ),
933970 "Reader.__next__" );
@@ -937,13 +974,16 @@ Reader_iternext(PyObject *op)
937974
938975 if (parse_reset (self ) < 0 )
939976 return NULL ;
977+
978+ dialect = self -> dialect ;
979+
940980 do {
941981 lineobj = PyIter_Next (self -> input_iter );
942982 if (lineobj == NULL ) {
943983 /* End of input OR exception */
944984 if (!PyErr_Occurred () && (self -> field_len != 0 ||
945985 self -> state == IN_QUOTED_FIELD )) {
946- if (self -> dialect -> strict )
986+ if (dialect -> strict )
947987 PyErr_SetString (module_state -> error_obj ,
948988 "unexpected end of data" );
949989 else if (parse_save_field (self ) >= 0 )
@@ -962,17 +1002,109 @@ Reader_iternext(PyObject *op)
9621002 return NULL ;
9631003 }
9641004 ++ self -> line_num ;
965- kind = PyUnicode_KIND (lineobj );
966- data = PyUnicode_DATA (lineobj );
967- pos = 0 ;
1005+
9681006 linelen = PyUnicode_GET_LENGTH (lineobj );
969- while (linelen -- ) {
970- c = PyUnicode_READ (kind , data , pos );
971- if (parse_process_char (self , module_state , c ) < 0 ) {
972- Py_DECREF (lineobj );
973- goto err ;
1007+ pos = 0 ;
1008+
1009+ while (pos < linelen ) {
1010+ switch (self -> state ) {
1011+ case IN_FIELD :
1012+ chunk_end = linelen ;
1013+
1014+ p = PyUnicode_FindChar (lineobj , dialect -> delimiter , pos , linelen , 1 );
1015+ if (p >= 0 && p < chunk_end ) {
1016+ chunk_end = p ;
1017+ } else if (p == -2 ) {
1018+ Py_DECREF (lineobj );
1019+ goto err ;
1020+ }
1021+ if (dialect -> escapechar != NOT_SET ) {
1022+ p = PyUnicode_FindChar (lineobj , dialect -> escapechar , pos , linelen , 1 );
1023+ if (p >= 0 && p < chunk_end ) {
1024+ chunk_end = p ;
1025+ } else if (p == -2 ) {
1026+ Py_DECREF (lineobj );
1027+ goto err ;
1028+ }
1029+ }
1030+ p = PyUnicode_FindChar (lineobj , '\n' , pos , linelen , 1 );
1031+ if (p >= 0 && p < chunk_end ) {
1032+ chunk_end = p ;
1033+ } else if (p == -2 ) {
1034+ Py_DECREF (lineobj );
1035+ goto err ;
1036+ }
1037+ p = PyUnicode_FindChar (lineobj , '\r' , pos , linelen , 1 );
1038+ if (p >= 0 && p < chunk_end ) {
1039+ chunk_end = p ;
1040+ } else if (p == -2 ) {
1041+ Py_DECREF (lineobj );
1042+ goto err ;
1043+ }
1044+
1045+ if (chunk_end > pos ) {
1046+ if (parse_add_substring (self , module_state , lineobj , pos , chunk_end ) < 0 ) {
1047+ Py_DECREF (lineobj );
1048+ goto err ;
1049+ }
1050+ }
1051+ pos = chunk_end ;
1052+
1053+ if (pos < linelen ) {
1054+ Py_UCS4 c = PyUnicode_READ_CHAR (lineobj , pos );
1055+ if (parse_process_char (self , module_state , c ) < 0 ) {
1056+ Py_DECREF (lineobj );
1057+ goto err ;
1058+ }
1059+ pos ++ ;
1060+ }
1061+ break ;
1062+ case IN_QUOTED_FIELD :
1063+ chunk_end = linelen ;
1064+
1065+ p = PyUnicode_FindChar (lineobj , dialect -> quotechar , pos , linelen , 1 );
1066+ if (p >= 0 && p < chunk_end ) {
1067+ chunk_end = p ;
1068+ } else if (p == -2 ) {
1069+ Py_DECREF (lineobj );
1070+ goto err ;
1071+ }
1072+ if (dialect -> escapechar != NOT_SET ) {
1073+ p = PyUnicode_FindChar (lineobj , dialect -> escapechar , pos , linelen , 1 );
1074+ if (p >= 0 && p < chunk_end ) {
1075+ chunk_end = p ;
1076+ } else if (p == -2 ) {
1077+ Py_DECREF (lineobj );
1078+ goto err ;
1079+ }
1080+ }
1081+
1082+ if (chunk_end > pos ) {
1083+ if (parse_add_substring (self , module_state , lineobj , pos , chunk_end ) < 0 ) {
1084+ Py_DECREF (lineobj );
1085+ goto err ;
1086+ }
1087+ }
1088+ pos = chunk_end ;
1089+
1090+ if (pos < linelen ) {
1091+ Py_UCS4 c = PyUnicode_READ_CHAR (lineobj , pos );
1092+ if (parse_process_char (self , module_state , c ) < 0 ) {
1093+ Py_DECREF (lineobj );
1094+ goto err ;
1095+ }
1096+ pos ++ ;
1097+ }
1098+ break ;
1099+ default :
1100+ Py_UCS4 c = PyUnicode_READ_CHAR (lineobj , pos );
1101+ if (parse_process_char (self , module_state , c ) < 0 ) {
1102+ Py_DECREF (lineobj );
1103+ goto err ;
1104+ }
1105+ pos ++ ;
1106+ break ;
9741107 }
975- pos ++ ;
9761108 }
9771109 Py_DECREF (lineobj );
9781110 if (parse_process_char (self , module_state , EOL ) < 0 )
0 commit comments