context("rex") test_that("start works", { r <- rex( start, letter ) expect_true(grepl(r, "abcdef")) expect_false(grepl(r, "123456", perl = TRUE)) }) test_that("end works", { r <- rex( "Z", end ) expect_true(grepl(r, "abcZ")) expect_false(grepl(r, "abc")) }) test_that("version parsing works", { r <- rex( start, capture(numbers), any, capture(numbers), any, capture(numbers), any, capture(numbers), end ) expect_identical( gsub(r, "\\1 \\2 \\3 \\4", "3.1.1-1", perl = TRUE), "3 1 1 1" ) }) test_that("verbs in rex work", { r <- rex( start, "foo", zero_or_more(any), "bar", end ) expect_true(grepl(r, "fooABCbar", perl = TRUE)) expect_true(grepl(r, "foo123\tbar", perl = TRUE)) }) test_that("Simple URL parsing works", { # TODO: get these working better again ## Decompose a URL into its components. ## Example by LT (http://www.cs.uiowa.edu/~luke/R/regexp.html). x <- "http://stat.umn.edu:80/xyz" re <- "^(?:(((?:(?:(?!:).)*)+)://))?((?:(?:(?!:/).)*)+)(?:(:((?:[[:digit:]]+)+)))?(?:(/(?:.)*))?$" #m <- regexec(re, x) #m #regmatches(x, m) ## Element 3 is the protocol, 4 is the host, 6 is the port, and 7 ## is the path. We can use this to make a function for extracting the ## parts of a URL: URL_parts <- function(x) { m <- regexec(re, x) parts <- do.call(rbind, lapply(regmatches(x, m), `[`, c(3L, 4L, 6L, 7L))) colnames(parts) <- c("protocol","host","port","path") parts } #URL_parts(x) r <- rex( start, ## match the protocol -- may exist or may not maybe(capture( capture(except_some_of(":")), "://" )), ## match the path capture(one_or_more(not(":/"))), ## get the port maybe(capture(":", capture(numbers))), ## and the rest maybe(capture("/", anything)), end ) rbind(r = r, m = re) n <- gregexpr(r, x, perl = TRUE)[[1]] split_matches <- function(string, matches) { starts <- attr(matches, "capture.start") lengths <- attr(matches, "capture.length") ends <- starts + lengths - 1 c(string, substring(string, starts, ends)) } #split_matches(x, n) #regmatches(x, m)[[1]] #expect_equal(regmatches(x, m)[[1]], split_matches(x, n)) }) context("URL Validation") test_that("URL Validation works", { valid_chars <- rex(except_some_of(".", "/", " ", "-")) `%>%` <- magrittr::`%>%` re <- rex( start, # protocol identifier (optional) + // group(list("http", maybe("s")) %or% "ftp", "://"), # user:pass authentication (optional) maybe(non_spaces, maybe(":", zero_or_more(non_space)), "@"), #host name group(zero_or_more(valid_chars, zero_or_more("-")), one_or_more(valid_chars)), #domain name zero_or_more(".", zero_or_more(valid_chars, zero_or_more("-")), one_or_more(valid_chars)), #TLD identifier group(".", valid_chars %>% at_least(2)), # server port number (optional) maybe(":", digit %>% between(2, 5)), # resource path (optional) maybe("/", non_space %>% zero_or_more()), end ) good <- c("http://foo.com/blah_blah", "http://foo.com/blah_blah/", "http://foo.com/blah_blah_(wikipedia)", "http://foo.com/blah_blah_(wikipedia)_(again)", "http://www.example.com/wpstyle/?p=364", "https://www.example.com/foo/?bar=baz&inga=42&quux", "http://✪df.ws/123", "http://userid:password@example.com:8080", "http://userid:password@example.com:8080/", "http://userid@example.com", "http://userid@example.com/", "http://userid@example.com:8080", "http://userid@example.com:8080/", "http://userid:password@example.com", "http://userid:password@example.com/", "http://➡.ws/䨹", "http://⌘.ws", "http://⌘.ws/", "http://foo.com/blah_(wikipedia)#cite-1", "http://foo.com/blah_(wikipedia)_blah#cite-1", "http://foo.com/unicode_(✪)_in_parens", "http://foo.com/(something)?after=parens", "http://☺.damowmow.com/", "http://code.google.com/events/#&product=browser", "http://j.mp", "ftp://foo.bar/baz", "http://foo.bar/?q=Test%20URL-encoded%20stuff", "http://مثال.إختبار", "http://例子.测试", "http://-.~_!$&'()*+,;=:%40:80%2f::::::@example.com", "http://1337.net", "http://a.b-c.de", "http://223.255.255.254") bad <- c( "http://", "http://.", "http://..", "http://../", "http://?", "http://??", "http://??/", "http://#", "http://##", "http://##/", "http://foo.bar?q=Spaces should be encoded", "//", "//a", "///a", "///", "http:///a", "foo.com", "rdar://1234", "h://test", # nolint "http:// shouldfail.com", ":// should fail", "http://foo.bar/foo(bar)baz quux", "ftps://foo.bar/", "http://-error-.invalid/", "http://-a.b.co", "http://a.b-.co", "http://0.0.0.0", "http://3628126748", "http://.www.foo.bar/", "http://www.foo.bar./", "http://.www.foo.bar./") lapply(good, function(x) { expect_true(grepl(re, x, perl = TRUE), info=x) }) lapply(bad, function(x) { expect_false(grepl(re, x, perl = TRUE), info=x) }) }) context("start") test_that("matches basic characters", { expect_equal(rex(start, "f"), regex("^f")) }) test_that("escapes special characters", { expect_equal(rex(start, "."), regex("^\\.")) }) test_that("matches basic characters", { expect_equal(rex(start, "x" %>% n_times(3)), regex("^(?:x){3}")) }) test_that("matches special identifiers", { expect_equal(rex(start, number %>% n_times(2)), regex("^(?:[[:digit:]]){2}")) }) context("appending expressions") test_that("adds basic characters", { expect_equal(rex("x", "y", "z"), regex("xyz")) expect_equal(rex("x", maybe("y"), "z"), regex("x(?:y)?z")) }) test_that("escapes special characters", { expect_equal(rex(numbers %>% between(0, 2), ".", "$", end), regex("(?:[[:digit:]]+){0,2}\\.\\$$")) }) context("shortcuts - end") test_that("matches basic characters", { expect_equal(rex("x", "y", end), regex("xy$")) }) test_that("escapes special characters", { expect_equal(rex("x", "$", end), regex("x\\$$")) }) context("general regex") test_that("returns a well-formed regex", { expect_equal(rex(start, "w", "x" %or% "y", "z", end), regex("^w(?:x|y)z$")) }) context("rex examples") re <- rex(start, number %>% n_times(3), "-", letter %>% n_times(2), maybe("#"), "a" %or% "b", "c" %>% between(2, 4), "$", end ) expect_true(grepl(re, "123-xy#accc$", perl=TRUE)) expect_true(grepl(re, "999-dfbcc$")) expect_false(grepl(re, "000-df#baccccccccc$")) expect_false(grepl(re, "444-dd3ac$")) context("issues") test_that("#11 Modifiers and named character classes", { p <- rex(none_of(alpha)) expect_true(grepl(p, "6", perl = TRUE)) })
Generated by dwww version 1.15 on Sun Jun 23 04:51:32 CEST 2024.