Merge pull request #133 from Jonas-Heinrich/master

Modify benchmarks to compare against stdlib functions
diff --git a/Cargo.toml b/Cargo.toml
index dda0abf..a8d25db 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -23,17 +23,16 @@
 
 [dev-dependencies]
 quickcheck = "0.7"
-criterion = "0.3"
+criterion = "0.5"
 
 [[bench]]
-name = "graphemes"
+name = "chars"
 harness = false
 
 [[bench]]
-name = "unicode_words"
+name = "words"
 harness = false
 
 [[bench]]
 name = "word_bounds"
 harness = false
-
diff --git a/benches/chars.rs b/benches/chars.rs
new file mode 100644
index 0000000..d8dc5ea
--- /dev/null
+++ b/benches/chars.rs
@@ -0,0 +1,60 @@
+//! Compares the performance of `UnicodeSegmentation::graphemes` with stdlib's UTF-8 scalar-based
+//! `std::str::chars`.
+//!
+//! It is expected that `std::str::chars` is faster than `UnicodeSegmentation::graphemes` since it
+//! does not consider the complexity of grapheme clusters. The question in this benchmark
+//! is how much slower full Unicode handling is.
+
+use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion};
+use unicode_segmentation;
+
+use std::fs;
+use unicode_segmentation::UnicodeSegmentation;
+
+const FILES: &[&str] = &[
+    "arabic",
+    "english",
+    "hindi",
+    "japanese",
+    "korean",
+    "mandarin",
+    "russian",
+    "source_code",
+];
+
+#[inline(always)]
+fn grapheme(text: &str) {
+    for c in UnicodeSegmentation::graphemes(black_box(&*text), true) {
+        black_box(c);
+    }
+}
+
+#[inline(always)]
+fn scalar(text: &str) {
+    for c in black_box(&*text).chars() {
+        black_box(c);
+    }
+}
+
+fn bench_all(c: &mut Criterion) {
+    let mut group = c.benchmark_group("chars");
+
+    for file in FILES {
+        group.bench_with_input(
+            BenchmarkId::new("grapheme", file),
+            &fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(),
+            |b, content| b.iter(|| grapheme(content)),
+        );
+    }
+
+    for file in FILES {
+        group.bench_with_input(
+            BenchmarkId::new("scalar", file),
+            &fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(),
+            |b, content| b.iter(|| scalar(content)),
+        );
+    }
+}
+
+criterion_group!(benches, bench_all);
+criterion_main!(benches);
diff --git a/benches/graphemes.rs b/benches/graphemes.rs
deleted file mode 100644
index 3a0b9b7..0000000
--- a/benches/graphemes.rs
+++ /dev/null
@@ -1,63 +0,0 @@
-use criterion::{black_box, criterion_group, criterion_main, Criterion};
-use unicode_segmentation;
-
-use std::fs;
-use unicode_segmentation::UnicodeSegmentation;
-
-fn graphemes(c: &mut Criterion, lang: &str, path: &str) {
-    let text = fs::read_to_string(path).unwrap();
-
-    c.bench_function(&format!("graphemes_{}", lang), |bench| {
-        bench.iter(|| {
-            for g in UnicodeSegmentation::graphemes(black_box(&*text), true) {
-                black_box(g);
-            }
-        })
-    });
-}
-
-fn graphemes_arabic(c: &mut Criterion) {
-    graphemes(c, "arabic", "benches/texts/arabic.txt");
-}
-
-fn graphemes_english(c: &mut Criterion) {
-    graphemes(c, "english", "benches/texts/english.txt");
-}
-
-fn graphemes_hindi(c: &mut Criterion) {
-    graphemes(c, "hindi", "benches/texts/hindi.txt");
-}
-
-fn graphemes_japanese(c: &mut Criterion) {
-    graphemes(c, "japanese", "benches/texts/japanese.txt");
-}
-
-fn graphemes_korean(c: &mut Criterion) {
-    graphemes(c, "korean", "benches/texts/korean.txt");
-}
-
-fn graphemes_mandarin(c: &mut Criterion) {
-    graphemes(c, "mandarin", "benches/texts/mandarin.txt");
-}
-
-fn graphemes_russian(c: &mut Criterion) {
-    graphemes(c, "russian", "benches/texts/russian.txt");
-}
-
-fn graphemes_source_code(c: &mut Criterion) {
-    graphemes(c, "source_code", "benches/texts/source_code.txt");
-}
-
-criterion_group!(
-    benches,
-    graphemes_arabic,
-    graphemes_english,
-    graphemes_hindi,
-    graphemes_japanese,
-    graphemes_korean,
-    graphemes_mandarin,
-    graphemes_russian,
-    graphemes_source_code,
-);
-
-criterion_main!(benches);
diff --git a/benches/unicode_words.rs b/benches/unicode_words.rs
deleted file mode 100644
index a7f8f41..0000000
--- a/benches/unicode_words.rs
+++ /dev/null
@@ -1,61 +0,0 @@
-use criterion::{black_box, criterion_group, criterion_main, Criterion};
-
-use std::fs;
-use unicode_segmentation::UnicodeSegmentation;
-
-fn unicode_words(c: &mut Criterion, lang: &str, path: &str) {
-    let text = fs::read_to_string(path).unwrap();
-    c.bench_function(&format!("unicode_words_{}", lang), |bench| {
-        bench.iter(|| {
-            for w in text.unicode_words() {
-                black_box(w);
-            }
-        })
-    });
-}
-
-fn unicode_words_arabic(c: &mut Criterion) {
-    unicode_words(c, "arabic", "benches/texts/arabic.txt");
-}
-
-fn unicode_words_english(c: &mut Criterion) {
-    unicode_words(c, "english", "benches/texts/english.txt");
-}
-
-fn unicode_words_hindi(c: &mut Criterion) {
-    unicode_words(c, "hindi", "benches/texts/hindi.txt");
-}
-
-fn unicode_words_japanese(c: &mut Criterion) {
-    unicode_words(c, "japanese", "benches/texts/japanese.txt");
-}
-
-fn unicode_words_korean(c: &mut Criterion) {
-    unicode_words(c, "korean", "benches/texts/korean.txt");
-}
-
-fn unicode_words_mandarin(c: &mut Criterion) {
-    unicode_words(c, "mandarin", "benches/texts/mandarin.txt");
-}
-
-fn unicode_words_russian(c: &mut Criterion) {
-    unicode_words(c, "russian", "benches/texts/russian.txt");
-}
-
-fn unicode_words_source_code(c: &mut Criterion) {
-    unicode_words(c, "source_code", "benches/texts/source_code.txt");
-}
-
-criterion_group!(
-    benches,
-    unicode_words_arabic,
-    unicode_words_english,
-    unicode_words_hindi,
-    unicode_words_japanese,
-    unicode_words_korean,
-    unicode_words_mandarin,
-    unicode_words_russian,
-    unicode_words_source_code,
-);
-
-criterion_main!(benches);
diff --git a/benches/word_bounds.rs b/benches/word_bounds.rs
index cae7a88..42d50ff 100644
--- a/benches/word_bounds.rs
+++ b/benches/word_bounds.rs
@@ -1,61 +1,37 @@
-use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion};
 
 use std::fs;
 use unicode_segmentation::UnicodeSegmentation;
 
-fn word_bounds(c: &mut Criterion, lang: &str, path: &str) {
-    let text = fs::read_to_string(path).unwrap();
-    c.bench_function(&format!("word_bounds_{}", lang), |bench| {
-        bench.iter(|| {
-            for w in text.split_word_bounds() {
-                black_box(w);
-            }
-        });
-    });
+const FILES: &[&str] = &[
+    "arabic",
+    "english",
+    "hindi",
+    "japanese",
+    "korean",
+    "mandarin",
+    "russian",
+    "source_code",
+];
+
+#[inline(always)]
+fn grapheme(text: &str) {
+    for w in text.split_word_bounds() {
+        black_box(w);
+    }
 }
 
-fn word_bounds_arabic(c: &mut Criterion) {
-    word_bounds(c, "arabic", "benches/texts/arabic.txt");
+fn bench_all(c: &mut Criterion) {
+    let mut group = c.benchmark_group("word_bounds");
+
+    for file in FILES {
+        group.bench_with_input(
+            BenchmarkId::new("grapheme", file),
+            &fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(),
+            |b, content| b.iter(|| grapheme(content)),
+        );
+    }
 }
 
-fn word_bounds_english(c: &mut Criterion) {
-    word_bounds(c, "english", "benches/texts/english.txt");
-}
-
-fn word_bounds_hindi(c: &mut Criterion) {
-    word_bounds(c, "hindi", "benches/texts/hindi.txt");
-}
-
-fn word_bounds_japanese(c: &mut Criterion) {
-    word_bounds(c, "japanese", "benches/texts/japanese.txt");
-}
-
-fn word_bounds_korean(c: &mut Criterion) {
-    word_bounds(c, "korean", "benches/texts/korean.txt");
-}
-
-fn word_bounds_mandarin(c: &mut Criterion) {
-    word_bounds(c, "mandarin", "benches/texts/mandarin.txt");
-}
-
-fn word_bounds_russian(c: &mut Criterion) {
-    word_bounds(c, "russian", "benches/texts/russian.txt");
-}
-
-fn word_bounds_source_code(c: &mut Criterion) {
-    word_bounds(c, "source_code", "benches/texts/source_code.txt");
-}
-
-criterion_group!(
-    benches,
-    word_bounds_arabic,
-    word_bounds_english,
-    word_bounds_hindi,
-    word_bounds_japanese,
-    word_bounds_korean,
-    word_bounds_mandarin,
-    word_bounds_russian,
-    word_bounds_source_code,
-);
-
+criterion_group!(benches, bench_all);
 criterion_main!(benches);
diff --git a/benches/words.rs b/benches/words.rs
new file mode 100644
index 0000000..86785d5
--- /dev/null
+++ b/benches/words.rs
@@ -0,0 +1,59 @@
+//! Compares the performance of `UnicodeSegmentation::unicode_words` with stdlib's
+//! whitespace-based `std::str::split_whitespace`.
+//!
+//! It is expected that `std::str::split_whitespace` is faster than
+//! `UnicodeSegmentation::unicode_words` since it does not apply the full Unicode word-boundary
+//! rules. The question in this benchmark is how much slower full Unicode handling is.
+
+use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion};
+
+use std::fs;
+use unicode_segmentation::UnicodeSegmentation;
+
+const FILES: &[&str] = &[
+    "arabic",
+    "english",
+    "hindi",
+    "japanese",
+    "korean",
+    "mandarin",
+    "russian",
+    "source_code",
+];
+
+#[inline(always)]
+fn grapheme(text: &str) {
+    for w in text.unicode_words() {
+        black_box(w);
+    }
+}
+
+#[inline(always)]
+fn scalar(text: &str) {
+    for w in text.split_whitespace() {
+        black_box(w);
+    }
+}
+
+fn bench_all(c: &mut Criterion) {
+    let mut group = c.benchmark_group("words");
+
+    for file in FILES {
+        group.bench_with_input(
+            BenchmarkId::new("grapheme", file),
+            &fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(),
+            |b, content| b.iter(|| grapheme(content)),
+        );
+    }
+
+    for file in FILES {
+        group.bench_with_input(
+            BenchmarkId::new("scalar", file),
+            &fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(),
+            |b, content| b.iter(|| scalar(content)),
+        );
+    }
+}
+
+criterion_group!(benches, bench_all);
+criterion_main!(benches);