@@ -11,7 +11,7 @@ use clap::Parser;
1111use gettextrs:: { bind_textdomain_codeset, setlocale, textdomain, LocaleCategory } ;
1212use plib:: PROJECT_NAME ;
1313use std:: ffi:: OsStr ;
14- use std:: io:: { self , BufRead , Read } ;
14+ use std:: io:: { self , Read } ;
1515use std:: path:: PathBuf ;
1616
1717/// wc - word, line, and byte or character count
@@ -122,7 +122,7 @@ fn build_display_str(args: &Args, count: &CountInfo, filename: &OsStr) -> String
122122 output
123123}
124124
125- fn wc_file_bytes ( count : & mut CountInfo , pathname : & PathBuf , table : & [ bool ; 256 ] ) -> io:: Result < ( ) > {
125+ fn wc_file_bytes ( count : & mut CountInfo , pathname : & PathBuf , table : & [ bool ; 256 ] , chars_mode : bool ) -> io:: Result < ( ) > {
126126 let mut file = plib:: io:: input_stream ( pathname, false ) ?;
127127
128128 let mut buffer = [ 0 ; plib:: BUFSZ ] ;
@@ -134,10 +134,17 @@ fn wc_file_bytes(count: &mut CountInfo, pathname: &PathBuf, table: &[bool; 256])
134134 break ;
135135 }
136136
137- count. chars = count. chars + n_read;
138-
139137 let bufslice = & buffer[ 0 ..n_read] ;
140138
139+ if !chars_mode {
140+ // number of bytes read
141+ count. chars = count. chars + n_read;
142+ } else {
143+ // count UTF-8 code points: in well-formed UTF-8 each code point has exactly one byte that is not a continuation byte (0b10xxxxxx)
144+ count. chars += bufslice. iter ( ) . filter ( |& ch| ( ch >> 6 ) != 0b10 ) . count ( ) ;
145+ }
146+
147+
141148 for ch_u8 in bufslice {
142149 let is_space = table[ * ch_u8 as usize ] ;
143150 count. nl += ( ch_u8 == & 10 ) as usize ;
@@ -149,55 +156,14 @@ fn wc_file_bytes(count: &mut CountInfo, pathname: &PathBuf, table: &[bool; 256])
149156 Ok ( ( ) )
150157}
151158
152- fn wc_file_chars ( args : & Args , count : & mut CountInfo , pathname : & PathBuf ) -> io:: Result < ( ) > {
153- let mut reader = plib:: io:: input_reader ( pathname, false ) ?;
154-
155- loop {
156- let mut buffer = String :: new ( ) ;
157- let n_read = reader. read_line ( & mut buffer) ?;
158- if n_read == 0 {
159- break ;
160- }
161-
162- count. nl = count. nl + 1 ;
163- count. chars = count. chars + n_read;
164-
165- if args. words {
166- let mut in_word = false ;
167-
168- for ch in buffer. chars ( ) {
169- if ch. is_whitespace ( ) {
170- if in_word {
171- in_word = false ;
172- count. words = count. words + 1 ;
173- }
174- } else {
175- if !in_word {
176- in_word = true ;
177- }
178- }
179- }
180- if in_word {
181- count. words = count. words + 1 ;
182- }
183- }
184- }
185-
186- Ok ( ( ) )
187- }
188-
189159fn wc_file (
190160 args : & Args ,
191161 chars_mode : bool ,
192162 pathname : & PathBuf ,
193163 count : & mut CountInfo ,
194164 table : & [ bool ; 256 ] ,
195165) -> io:: Result < ( ) > {
196- if chars_mode {
197- wc_file_chars ( args, count, pathname) ?;
198- } else {
199- wc_file_bytes ( count, pathname, table) ?;
200- }
166+ wc_file_bytes ( count, pathname, table, chars_mode) ?;
201167
202168 let output = build_display_str ( args, count, pathname. as_os_str ( ) ) ;
203169
0 commit comments