@@ -11,7 +11,7 @@ use clap::Parser;
1111use gettextrs:: { bind_textdomain_codeset, setlocale, textdomain, LocaleCategory } ;
1212use plib:: PROJECT_NAME ;
1313use std:: ffi:: OsStr ;
14- use std:: io:: { self , BufRead , Read } ;
14+ use std:: io:: { self , Read } ;
1515use std:: path:: PathBuf ;
1616
1717/// wc - word, line, and byte or character count
@@ -42,7 +42,6 @@ struct CountInfo {
4242 words : usize ,
4343 chars : usize ,
4444 nl : usize ,
45- was_space : bool ,
4645}
4746
4847impl CountInfo {
@@ -51,7 +50,6 @@ impl CountInfo {
5150 words : 0 ,
5251 chars : 0 ,
5352 nl : 0 ,
54- was_space : true ,
5553 }
5654 }
5755
@@ -124,67 +122,36 @@ fn build_display_str(args: &Args, count: &CountInfo, filename: &OsStr) -> String
124122 output
125123}
126124
// wc_file_bytes: opens `pathname` via plib::io::input_stream, reads it in plib::BUFSZ-sized
// chunks, and accumulates the chars / nl / words totals into `count`. Any I/O error from
// opening or reading is propagated to the caller through `?`.
// Diff view: `-` lines are the removed implementation, `+` lines are its replacement, and
// lines carrying two line numbers are unchanged context shared by both.
// The replacement swaps the `table` lookup parameter for a `chars_mode` flag; when the flag is
// set, `count.chars` receives a UTF-8 character count instead of a byte count.
127- fn wc_file_bytes ( count : & mut CountInfo , pathname : & PathBuf , table : & [ bool ; 256 ] ) -> io:: Result < ( ) > {
125+ fn wc_file_bytes ( count : & mut CountInfo , pathname : & PathBuf , chars_mode : bool ) -> io:: Result < ( ) > {
128126 let mut file = plib:: io:: input_stream ( pathname, false ) ?;
129127
130128 let mut buffer = [ 0 ; plib:: BUFSZ ] ;
// The removed version resumed the word state stored in `count.was_space`; the replacement
// always starts as "previous byte was whitespace", so a leading non-space byte opens a word.
131- let mut was_space = count . was_space ;
129+ let mut was_space = true ;
132130
133131 loop {
// A read of zero bytes is treated as end of input.
134132 let n_read = file. read ( & mut buffer[ ..] ) ?;
135133 if n_read == 0 {
136134 break ;
137135 }
138136
139- count. chars = count. chars + n_read;
140-
// Only the first n_read bytes of the buffer hold data from this read.
141137 let bufslice = & buffer[ 0 ..n_read] ;
142138
139+ if !chars_mode {
140+ // number of bytes read
141+ count. chars = count. chars + n_read;
142+ } else {
143+ // number of UTF-8 unicode codepoints in this slice of bytes
// Bytes whose top two bits are 0b10 are UTF-8 continuation bytes; every other byte is counted
// as the start of one character. Each byte is classified on its own, so a multi-byte sequence
// split across two reads is still counted once. Nothing here validates that the input is
// well-formed UTF-8.
144+ count. chars += bufslice. iter ( ) . filter ( |& ch| ( ch >> 6 ) != 0b10 ) . count ( ) ;
145+ }
146+
147+
// Line and word counting is byte-oriented in both versions, regardless of chars_mode.
143148 for ch_u8 in bufslice {
// The whitespace lookup is indexed by byte value. The removed call sites passed `&BYTE_TABLE`
// as the `[bool; 256]` argument; which bytes it marks as whitespace is defined outside this
// view — TODO confirm.
144- let is_space = table [ * ch_u8 as usize ] ;
149+ let is_space = BYTE_TABLE [ * ch_u8 as usize ] ;
// Byte value 10 is '\n'.
145150 count. nl += ( ch_u8 == & 10 ) as usize ;
// A word is counted at each transition from a whitespace byte to a non-whitespace byte.
146151 count. words += ( !is_space && was_space) as usize ;
147152 was_space = is_space;
148153 }
149154 }
150- count. was_space = was_space;
151-
152- Ok ( ( ) )
153- }
154-
// Removed wc_file_chars: the former chars-mode path. It read the input one line at a time,
// added 1 to `nl` for every non-empty read_line result, and added read_line's return value to
// `chars`. NOTE(review): BufRead::read_line is documented to return the number of BYTES read,
// so this path appears to have counted bytes rather than characters — confirm against std docs.
// Words were counted only when `args.words` was set, using char::is_whitespace per line.
155- fn wc_file_chars ( args : & Args , count : & mut CountInfo , pathname : & PathBuf ) -> io:: Result < ( ) > {
156- let mut reader = plib:: io:: input_reader ( pathname, false ) ?;
157-
158- loop {
159- let mut buffer = String :: new ( ) ;
160- let n_read = reader. read_line ( & mut buffer) ?;
161- if n_read == 0 {
162- break ;
163- }
164-
165- count. nl = count. nl + 1 ;
166- count. chars = count. chars + n_read;
167-
168- if args. words {
169- let mut in_word = false ;
170-
171- for ch in buffer. chars ( ) {
172- if ch. is_whitespace ( ) {
173- if in_word {
174- in_word = false ;
175- count. words = count. words + 1 ;
176- }
177- } else {
178- if !in_word {
179- in_word = true ;
180- }
181- }
182- }
// A line that ends while still inside a word closes that word here.
183- if in_word {
184- count. words = count. words + 1 ;
185- }
186- }
187- }
188155
189156 Ok ( ( ) )
190157}
@@ -194,13 +161,8 @@ fn wc_file(
194161 chars_mode : bool ,
195162 pathname : & PathBuf ,
196163 count : & mut CountInfo ,
197- table : & [ bool ; 256 ] ,
198164) -> io:: Result < ( ) > {
199- if chars_mode {
200- wc_file_chars ( args, count, pathname) ?;
201- } else {
202- wc_file_bytes ( count, pathname, table) ?;
203- }
165+ wc_file_bytes ( count, pathname, chars_mode) ?;
204166
205167 let output = build_display_str ( args, count, pathname. as_os_str ( ) ) ;
206168
@@ -236,7 +198,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
236198 if args. files . is_empty ( ) {
237199 let mut count = CountInfo :: new ( ) ;
238200
239- if let Err ( e) = wc_file ( & args, chars_mode, & PathBuf :: new ( ) , & mut count, & BYTE_TABLE ) {
201+ if let Err ( e) = wc_file ( & args, chars_mode, & PathBuf :: new ( ) , & mut count) {
240202 exit_code = 1 ;
241203 eprintln ! ( "stdin: {}" , e) ;
242204 }
@@ -246,7 +208,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
246208 for filename in & args. files {
247209 let mut count = CountInfo :: new ( ) ;
248210
249- if let Err ( e) = wc_file ( & args, chars_mode, filename, & mut count, & BYTE_TABLE ) {
211+ if let Err ( e) = wc_file ( & args, chars_mode, filename, & mut count) {
250212 exit_code = 1 ;
251213 eprintln ! ( "{}: {}" , filename. display( ) , e) ;
252214 }
0 commit comments