@@ -103,9 +103,9 @@ public LoCInfo extract(TextualExtractor textualExtractor) {
103103 try {
104104 parser = new ParserImpl (new StreamReader (textualExtractor .getSource ()));
105105 resolver = new Resolver ();
106-
107106 int idx = 0 ;
108- while (!atStreamEnd ()) extractDocument (fileLabel , idx ++);
107+ while (!atStreamEnd ())
108+ extractDocument (fileLabel , idx ++, textualExtractor .getSource ().codePoints ().toArray ());
109109 } catch (MarkedYAMLException e ) {
110110 int line = e .getProblemMark ().getLine () + 1 ;
111111 int column = e .getProblemMark ().getColumn () + 1 ;
@@ -136,16 +136,16 @@ private boolean atStreamEnd() {
136136 }
137137
138138 /** Extract a complete YAML document; cf. {@link Composer#composeDocument}. */
139- private void extractDocument (Label parent , int idx ) {
139+ private void extractDocument (Label parent , int idx , int [] codepoints ) { // codepoints: code points of the source text, forwarded unchanged to extractNode
140140 // Drop the DOCUMENT-START event
141141 parser .getEvent ();
142- extractNode (parent , idx );
142+ extractNode (parent , idx , codepoints ); // NOTE(review): extract() rebuilds this array via codePoints().toArray() for every document; hoisting it out of that loop would avoid repeated conversion
143143 // Drop the DOCUMENT-END event
144144 parser .getEvent ();
145145 }
146146
147147 /** Extract a single YAML node; cf. {@link Composer#composeNode}. */
148- private void extractNode (Label parent , int idx ) {
148+ private void extractNode (Label parent , int idx , int [] codepoints ) {
149149 Label label = trapWriter .freshLabel ();
150150 NodeKind kind ;
151151 String tag = "" ;
@@ -169,15 +169,14 @@ private void extractNode(Label parent, int idx) {
169169 scalar .getImplicit ().canOmitTagInPlainScalar ());
170170 Character style = scalar .getStyle ();
171171 int styleCode = style == null ? 0 : (int ) style ;
172- trapWriter .addTuple (
173- YAMLTables .YAML_SCALARS , label , styleCode , scalar .getValue ());
172+ trapWriter .addTuple (YAMLTables .YAML_SCALARS , label , styleCode , scalar .getValue ());
174173 } else if (start .is (Event .ID .SequenceStart )) {
175174 kind = NodeKind .SEQUENCE ;
176175 SequenceStartEvent sequenceStart = (SequenceStartEvent ) start ;
177176 tag = getTag (sequenceStart .getTag (), NodeId .sequence , null , sequenceStart .getImplicit ());
178177
179178 int childIdx = 0 ;
180- while (!parser .checkEvent (Event .ID .SequenceEnd )) extractNode (label , childIdx ++);
179+ while (!parser .checkEvent (Event .ID .SequenceEnd )) extractNode (label , childIdx ++, codepoints );
181180
182181 end = parser .getEvent ();
183182 } else if (start .is (Event .ID .MappingStart )) {
@@ -187,8 +186,8 @@ private void extractNode(Label parent, int idx) {
187186
188187 int childIdx = 1 ;
189188 while (!parser .checkEvent (Event .ID .MappingEnd )) {
190- extractNode (label , childIdx );
191- extractNode (label , -childIdx );
189+ extractNode (label , childIdx , codepoints );
190+ extractNode (label , -childIdx , codepoints );
192191 ++childIdx ;
193192 }
194193
@@ -205,7 +204,7 @@ private void extractNode(Label parent, int idx) {
205204 parent ,
206205 idx ,
207206 tag ,
208- mkToString (start .getStartMark (), end .getEndMark ()));
207+ mkToString (start .getStartMark (), end .getEndMark (), codepoints ));
209208 extractLocation (label , start .getStartMark (), end .getEndMark ());
210209 }
211210
@@ -216,33 +215,30 @@ private String getTag(String explicitTag, NodeId kind, String value, boolean imp
216215 return explicitTag ;
217216 }
218217
218+ private static boolean isNewLine (int codePoint ) { // true iff codePoint is one of the five line-break characters listed below
219+ switch (codePoint ) {
220+ case '\n' : // U+000A LINE FEED
221+ case '\r' : // U+000D CARRIAGE RETURN
222+ case '\u0085' : // U+0085 NEXT LINE (NEL)
223+ case '\u2028' : // U+2028 LINE SEPARATOR
224+ case '\u2029' : // U+2029 PARAGRAPH SEPARATOR
225+ return true ;
226+ default :
227+ return false ;
228+ }
229+ }
230+
219231 /**
220- * SnakeYAML doesn't directly expose the source text of nodes, but we can get a decent
221- * approximation from the snippet associated with the node's start {@linkplain Mark}.
222- *
223- * <p>The snippet of a {@linkplain Mark} is meant to be used for diagnostic messages and consists
224- * of two lines: the first line contains some context around the source position represented by
225- * the mark, the second line contains a caret character positioned underneath the source position
226- * itself.
227- *
228- * <p>To approximate the source text, we take the text on the first line and strip off the first
229- * <i>n</i> characters, where <i>n</i> is the number of spaces preceding the caret character on
230- * the second line.
231- *
232- * <p>This is only an approximation, since the context is limited to relatively short strings that
233- * never extend across newlines, but it suffices for the purposes of <code>toString</code>.
232+ * SnakeYAML doesn't directly expose the source text of nodes, so the caller also passes in the file
233+ * contents as an array of Unicode code points. The start and end marks each contain an index into
234+ * that array (the end index is exclusive), so we can reconstruct the snippet. For readability, we
235+ * stop at the first newline encountered, so the result never spans more than one line.
234236 */
235- private String mkToString (Mark startMark , Mark endMark ) {
236- String snippet = startMark .get_snippet (0 , Integer .MAX_VALUE );
237- int nl = snippet .indexOf ('\n' );
238- String context = snippet .substring (0 , nl );
239- String src = context .substring (snippet .substring (nl + 1 ).indexOf ('^' ));
240- int desiredStringLength = endMark .getColumn () - startMark .getColumn ();
241- boolean hasAccessToDesiredString = src .length () >= desiredStringLength ;
242- boolean isSingleLine = endMark .getLine () == startMark .getLine ();
243- if (isSingleLine && hasAccessToDesiredString )
244- src = src .substring (0 , desiredStringLength );
245- return TextualExtractor .sanitiseToString (src );
237+ private static String mkToString (Mark startMark , Mark endMark , int [] codepoints ) {
238+ StringBuilder b = new StringBuilder ();
239+ for (int i = startMark .getIndex (); i < endMark .getIndex () && !isNewLine (codepoints [i ]); i ++) // NOTE(review): codepoints[i] is not bounds-checked; assumes mark indices are code-point offsets that lie within this array - confirm
240+ b .appendCodePoint (codepoints [i ]);
241+ return TextualExtractor .sanitiseToString (b .toString ());
246242 }
247243
248244 /** Emit a source location for a YAML node. */
0 commit comments