Skip to content

Commit 21e1ef1

Browse files
authored
Merge pull request #248 from gcanat/wc_chars
[wc] faster character count
2 parents 441b889 + 98b7fc9 commit 21e1ef1

File tree

1 file changed

+16
-54
lines changed

1 file changed

+16
-54
lines changed

text/wc.rs

Lines changed: 16 additions & 54 deletions
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,7 @@ use clap::Parser;
1111
use gettextrs::{bind_textdomain_codeset, setlocale, textdomain, LocaleCategory};
1212
use plib::PROJECT_NAME;
1313
use std::ffi::OsStr;
14-
use std::io::{self, BufRead, Read};
14+
use std::io::{self, Read};
1515
use std::path::PathBuf;
1616

1717
/// wc - word, line, and byte or character count
@@ -42,7 +42,6 @@ struct CountInfo {
4242
words: usize,
4343
chars: usize,
4444
nl: usize,
45-
was_space: bool,
4645
}
4746

4847
impl CountInfo {
@@ -51,7 +50,6 @@ impl CountInfo {
5150
words: 0,
5251
chars: 0,
5352
nl: 0,
54-
was_space: true,
5553
}
5654
}
5755

@@ -124,67 +122,36 @@ fn build_display_str(args: &Args, count: &CountInfo, filename: &OsStr) -> String
124122
output
125123
}
126124

127-
fn wc_file_bytes(count: &mut CountInfo, pathname: &PathBuf, table: &[bool; 256]) -> io::Result<()> {
125+
fn wc_file_bytes(count: &mut CountInfo, pathname: &PathBuf, chars_mode: bool) -> io::Result<()> {
128126
let mut file = plib::io::input_stream(pathname, false)?;
129127

130128
let mut buffer = [0; plib::BUFSZ];
131-
let mut was_space = count.was_space;
129+
let mut was_space = true;
132130

133131
loop {
134132
let n_read = file.read(&mut buffer[..])?;
135133
if n_read == 0 {
136134
break;
137135
}
138136

139-
count.chars = count.chars + n_read;
140-
141137
let bufslice = &buffer[0..n_read];
142138

139+
if !chars_mode {
140+
// number of bytes read
141+
count.chars = count.chars + n_read;
142+
} else {
143+
// number of UTF-8 unicode codepoints in this slice of bytes
144+
count.chars += bufslice.iter().filter(|&ch| (ch >> 6) != 0b10).count();
145+
}
146+
147+
143148
for ch_u8 in bufslice {
144-
let is_space = table[*ch_u8 as usize];
149+
let is_space = BYTE_TABLE[*ch_u8 as usize];
145150
count.nl += (ch_u8 == &10) as usize;
146151
count.words += (!is_space && was_space) as usize;
147152
was_space = is_space;
148153
}
149154
}
150-
count.was_space = was_space;
151-
152-
Ok(())
153-
}
154-
155-
fn wc_file_chars(args: &Args, count: &mut CountInfo, pathname: &PathBuf) -> io::Result<()> {
156-
let mut reader = plib::io::input_reader(pathname, false)?;
157-
158-
loop {
159-
let mut buffer = String::new();
160-
let n_read = reader.read_line(&mut buffer)?;
161-
if n_read == 0 {
162-
break;
163-
}
164-
165-
count.nl = count.nl + 1;
166-
count.chars = count.chars + n_read;
167-
168-
if args.words {
169-
let mut in_word = false;
170-
171-
for ch in buffer.chars() {
172-
if ch.is_whitespace() {
173-
if in_word {
174-
in_word = false;
175-
count.words = count.words + 1;
176-
}
177-
} else {
178-
if !in_word {
179-
in_word = true;
180-
}
181-
}
182-
}
183-
if in_word {
184-
count.words = count.words + 1;
185-
}
186-
}
187-
}
188155

189156
Ok(())
190157
}
@@ -194,13 +161,8 @@ fn wc_file(
194161
chars_mode: bool,
195162
pathname: &PathBuf,
196163
count: &mut CountInfo,
197-
table: &[bool; 256],
198164
) -> io::Result<()> {
199-
if chars_mode {
200-
wc_file_chars(args, count, pathname)?;
201-
} else {
202-
wc_file_bytes(count, pathname, table)?;
203-
}
165+
wc_file_bytes(count, pathname, chars_mode)?;
204166

205167
let output = build_display_str(args, count, pathname.as_os_str());
206168

@@ -236,7 +198,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
236198
if args.files.is_empty() {
237199
let mut count = CountInfo::new();
238200

239-
if let Err(e) = wc_file(&args, chars_mode, &PathBuf::new(), &mut count, &BYTE_TABLE) {
201+
if let Err(e) = wc_file(&args, chars_mode, &PathBuf::new(), &mut count) {
240202
exit_code = 1;
241203
eprintln!("stdin: {}", e);
242204
}
@@ -246,7 +208,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
246208
for filename in &args.files {
247209
let mut count = CountInfo::new();
248210

249-
if let Err(e) = wc_file(&args, chars_mode, filename, &mut count, &BYTE_TABLE) {
211+
if let Err(e) = wc_file(&args, chars_mode, filename, &mut count) {
250212
exit_code = 1;
251213
eprintln!("{}: {}", filename.display(), e);
252214
}

0 commit comments

Comments
 (0)