For those in a hurry, here's a quick code snippet for a whitespace tokenizer in Rust:
// file: whitespace_tokenizer.rs
use std::env;
use std::process;
use std::fs::File;
use std::io::BufReader;
use std::io::Read;
fn main() {
    let args: Vec<String> = env::args().collect();
    if args.len() != 2 {
        // Usage errors belong on stderr, not stdout.
        eprintln!("Usage: {:?} file_name.txt", args[0]);
        process::exit(1);
    }
    // Access the file named on the command line. Borrowing args[1]
    // avoids the clone the original needed to move the name into open().
    let f: File = match File::open(&args[1]) {
        Ok(file) => file,
        Err(err) => {
            eprintln!("Err: opening file: {:?}", err);
            process::exit(1);
        }
    };
    // Buffer the stream: Read::bytes() on a bare File would otherwise
    // issue one syscall per byte.
    match tokenize(BufReader::new(f)) {
        Ok(tokens) => {
            for token in tokens {
                println!("token: {:?}", token);
            }
        }
        Err(err) => {
            eprintln!("Err: read bytes: {:?}", err);
            process::exit(1);
        }
    }
}

/// Splits `input` into tokens separated by single-byte (ASCII) whitespace.
///
/// UTF-8 bytes in the range 128-255 always belong to a multi-byte
/// sequence (continuation or leading bytes), so they are accumulated
/// verbatim and can never be mistaken for a separator. Accumulated byte
/// runs that turn out not to be valid UTF-8 are reported on stderr and
/// skipped, matching the original program's behavior.
///
/// Returns the tokens in input order, or the first I/O error hit while
/// reading.
fn tokenize(input: impl Read) -> Result<Vec<String>, std::io::Error> {
    let mut tokens: Vec<String> = Vec::new();
    let mut buf: Vec<u8> = Vec::new();
    for res_byte in input.bytes() {
        let byte = res_byte?;
        // 0-127 is a complete character by itself; only such a byte can
        // act as a whitespace separator.
        if byte <= 127 && (byte as char).is_whitespace() {
            flush_token(&mut buf, &mut tokens);
        } else {
            buf.push(byte);
        }
    }
    // BUG FIX: the original dropped a token that ran up to EOF (a file
    // not ending in whitespace); flush the final buffer as well.
    flush_token(&mut buf, &mut tokens);
    Ok(tokens)
}

/// Converts the buffered bytes into a token, appends it to `tokens`,
/// and leaves `buf` empty. An empty buffer (consecutive separators) is
/// a no-op; invalid UTF-8 is reported on stderr and discarded.
fn flush_token(buf: &mut Vec<u8>, tokens: &mut Vec<String>) {
    if buf.is_empty() {
        return;
    }
    // mem::take hands the buffer to from_utf8 without the clone the
    // original made; `buf` is left as a fresh empty Vec.
    match String::from_utf8(std::mem::take(buf)) {
        Ok(token) => tokens.push(token),
        Err(err) => eprintln!("Err: convert bytes to String: {:?}", err),
    }
}
Read on for detailed explanations and how to compile and use this program.
Understanding the Problem
When dealing with UTF-8 encoded files in Rust, it's important to understand that some characters are represented by multiple bytes. For example, the character Å is composed of the two bytes 195 and 133. Mistaking a byte of a multi-byte character for whitespace can lead to incorrect tokenization.
Useful UTF-8 Byte-Ranges
Here's a handy reference for UTF-8 byte-ranges:
// 0 - 127 => Single-byte character
// 128 - 191 => Continuation bytes
// 194 - 244 => Leading bytes
// 192,193,245-255 => Invalid utf-8
Only split the token if the whitespace is a single-byte character.
Building the Whitespace Tokenizer
Here's a detailed look at the whitespace tokenizer in Rust:
// file: whitespace_tokenizer.rs
use std::env;
use std::process;
use std::fs::File;
use std::io::BufReader;
use std::io::Read;
fn main() {
    let args: Vec<String> = env::args().collect();
    if args.len() != 2 {
        // Usage errors belong on stderr, not stdout.
        eprintln!("Usage: {:?} file_name.txt", args[0]);
        process::exit(1);
    }
    // Access the file named on the command line. Borrowing args[1]
    // avoids the clone the original needed to move the name into open().
    let f: File = match File::open(&args[1]) {
        Ok(file) => file,
        Err(err) => {
            eprintln!("Err: opening file: {:?}", err);
            process::exit(1);
        }
    };
    // Buffer the stream: Read::bytes() on a bare File would otherwise
    // issue one syscall per byte.
    match tokenize(BufReader::new(f)) {
        Ok(tokens) => {
            for token in tokens {
                println!("token: {:?}", token);
            }
        }
        Err(err) => {
            eprintln!("Err: read bytes: {:?}", err);
            process::exit(1);
        }
    }
}

/// Splits `input` into tokens separated by single-byte (ASCII) whitespace.
///
/// UTF-8 bytes in the range 128-255 always belong to a multi-byte
/// sequence (continuation or leading bytes), so they are accumulated
/// verbatim and can never be mistaken for a separator. Accumulated byte
/// runs that turn out not to be valid UTF-8 are reported on stderr and
/// skipped, matching the original program's behavior.
///
/// Returns the tokens in input order, or the first I/O error hit while
/// reading.
fn tokenize(input: impl Read) -> Result<Vec<String>, std::io::Error> {
    let mut tokens: Vec<String> = Vec::new();
    let mut buf: Vec<u8> = Vec::new();
    for res_byte in input.bytes() {
        let byte = res_byte?;
        // 0-127 is a complete character by itself; only such a byte can
        // act as a whitespace separator.
        if byte <= 127 && (byte as char).is_whitespace() {
            flush_token(&mut buf, &mut tokens);
        } else {
            buf.push(byte);
        }
    }
    // BUG FIX: the original dropped a token that ran up to EOF (a file
    // not ending in whitespace); flush the final buffer as well.
    flush_token(&mut buf, &mut tokens);
    Ok(tokens)
}

/// Converts the buffered bytes into a token, appends it to `tokens`,
/// and leaves `buf` empty. An empty buffer (consecutive separators) is
/// a no-op; invalid UTF-8 is reported on stderr and discarded.
fn flush_token(buf: &mut Vec<u8>, tokens: &mut Vec<String>) {
    if buf.is_empty() {
        return;
    }
    // mem::take hands the buffer to from_utf8 without the clone the
    // original made; `buf` is left as a fresh empty Vec.
    match String::from_utf8(std::mem::take(buf)) {
        Ok(token) => tokens.push(token),
        Err(err) => eprintln!("Err: convert bytes to String: {:?}", err),
    }
}
This program reads a file, processes its bytes, and identifies tokens separated by whitespace, considering UTF-8 encoding.
Compiling and Testing
Compile the Code:
rustc whitespace_tokenizer.rs
Run the Tokenizer:
./whitespace_tokenizer file_name.txt
Conclusion
Congratulations! You now have a Rust program that can tokenize text based on whitespace while handling UTF-8 encoding with care. Happy coding and exploring the world of text processing in Rust!