From 45ac5e34fcefa38552804b124fcae3c20ef22e41 Mon Sep 17 00:00:00 2001
From: Serg Gini
Date: Mon, 5 Jun 2023 20:51:47 +0300
Subject: [PATCH 1/2] Optimized solution based on R base

Uses PCRE engine and Bytes for text data, instead of UTF
---
Review notes (not part of the commit message):
- r/benchmark.R: elapsed time is pinned to seconds before converting to ms,
  the stray "n" is removed from the usage error, and the extra backslash is
  dropped from the e-mail character class.
- Line breaks and context-line indentation were rebuilt from a
  whitespace-collapsed copy of this series; verify them against the target
  tree before applying. The index blob id of r/benchmark.R is the original
  one and was not recomputed.

 Dockerfile         |  4 ++++
 r/README.md        |  8 ++++++++
 r/benchmark.R      | 37 +++++++++++++++++++++++++++++++++++++
 run-benchmarks.php |  1 +
 4 files changed, 50 insertions(+)
 create mode 100644 r/README.md
 create mode 100644 r/benchmark.R

diff --git a/Dockerfile b/Dockerfile
index 1af642f..357b17d 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -132,6 +132,10 @@ RUN wget -q https://downloads.python.org/pypy/pypy3.6-v7.3.3-linux64.tar.bz2 -O
     ln -s /opt/pypy3/bin/pypy3 /usr/local/bin/pypy3 && \
     rm pypy3.6-v7.3.3-linux64.tar.bz2
 
+## R
+RUN apt-get install -yq --no-install-recommends \
+    r-base r-cran-readr
+
 ## Ruby
 RUN apt-get install -yq --no-install-recommends \
     ruby-full
diff --git a/r/README.md b/r/README.md
new file mode 100644
index 0000000..4a9dd44
--- /dev/null
+++ b/r/README.md
@@ -0,0 +1,8 @@
+# R Regex Benchmark
+
+## How to run
+
+```sh
+# R
+RScript --vanilla benchmark.R
+```
diff --git a/r/benchmark.R b/r/benchmark.R
new file mode 100644
index 0000000..49591fd
--- /dev/null
+++ b/r/benchmark.R
@@ -0,0 +1,37 @@
+#!/usr/bin/env Rscript
+library(readr)
+
+# Time one regex scan over `data` and print() "<elapsed ms> - <match count>".
+#
+# data:    single string holding the text to search.
+# pattern: PCRE pattern, applied with perl = TRUE and useBytes = TRUE.
+measure <- function(data, pattern) {
+  start <- Sys.time()
+  hits <- gregexpr(pattern, data, perl = TRUE, useBytes = TRUE)
+  res <- length(regmatches(data, hits)[[1]])
+  end <- Sys.time()
+  # Pin the unit to seconds: `end - start` chooses secs/mins/hours on its
+  # own, so a run longer than a minute would be reported in minutes.
+  elapsed_ms <- as.numeric(difftime(end, start, units = "secs")) * 1e3
+  out <- paste0(format(elapsed_ms, digits = 3), " - ", res)
+  print(out)
+}
+
+args <- commandArgs(trailingOnly = TRUE)
+
+if (length(args) == 0) {
+  stop("At least one argument must be supplied (input file).", call. = FALSE)
+}
+
+file_str <- read_file(args[1])
+
+# Email
+# Raw strings keep backslashes verbatim, so the dot is escaped with a single
+# backslash; doubling it would add a literal backslash to the class.
+measure(file_str, r"{[\w\.+-]+@[\w\.-]+\.[\w\.-]+}")
+
+# URI
+measure(file_str, r"{[\w]+://[^/\s?#]+[^\s?#]+(?:\?[^\s#]*)?(?:#[^\s]*)?}")
+
+# IPv4
+measure(file_str, r"{(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9])\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9])}")
diff --git a/run-benchmarks.php b/run-benchmarks.php
index 86756b0..7e80cc3 100644
--- a/run-benchmarks.php
+++ b/run-benchmarks.php
@@ -46,6 +46,7 @@
     'Python 3' => 'python3.6 python/benchmark.py',
     'Python PyPy2' => 'pypy2 python/benchmark.py',
     'Python PyPy3' => 'pypy3 python/benchmark.py',
+    'R' => 'RScript --vanilla r/benchmark.R',
     'Ruby' => 'ruby ruby/benchmark.rb',
     'Rust' => 'rust/target/release/benchmark',
 ];

From 875de7e5c9dfa73b5bd7642c9cfc99a13ae79611 Mon Sep 17 00:00:00 2001
From: Serg Gini
Date: Mon, 5 Jun 2023 20:58:21 +0300
Subject: [PATCH 2/2] Fixed typo

---
 r/README.md        | 2 +-
 run-benchmarks.php | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/r/README.md b/r/README.md
index 4a9dd44..77520cd 100644
--- a/r/README.md
+++ b/r/README.md
@@ -4,5 +4,5 @@
 
 ```sh
 # R
-RScript --vanilla benchmark.R
+Rscript --vanilla benchmark.R
 ```
diff --git a/run-benchmarks.php b/run-benchmarks.php
index 7e80cc3..78e35b0 100644
--- a/run-benchmarks.php
+++ b/run-benchmarks.php
@@ -46,7 +46,7 @@
     'Python 3' => 'python3.6 python/benchmark.py',
     'Python PyPy2' => 'pypy2 python/benchmark.py',
     'Python PyPy3' => 'pypy3 python/benchmark.py',
-    'R' => 'RScript --vanilla r/benchmark.R',
+    'R' => 'Rscript --vanilla r/benchmark.R',
     'Ruby' => 'ruby ruby/benchmark.rb',
     'Rust' => 'rust/target/release/benchmark',
 ];