From 29657436563da9979d6d2273ccf9edb22f2c031c Mon Sep 17 00:00:00 2001 From: taoky Date: Wed, 13 Nov 2024 03:32:35 +0800 Subject: [PATCH] parser: Update nginx parser for nodejs/dist cases --- fixtures/nodejs/v4.9.1/index.html | 59 +++++++++++++++++ src/parser/mod.rs | 8 ++- src/parser/nginx.rs | 102 ++++++++++++++++++++++++------ 3 files changed, 147 insertions(+), 22 deletions(-) create mode 100644 fixtures/nodejs/v4.9.1/index.html diff --git a/fixtures/nodejs/v4.9.1/index.html b/fixtures/nodejs/v4.9.1/index.html new file mode 100644 index 0000000..a35b670 --- /dev/null +++ b/fixtures/nodejs/v4.9.1/index.html @@ -0,0 +1,59 @@ + + + Index of /dist/v4.9.1/ + + + +

Index of /dist/v4.9.1/


../
+docs/                                                             -                   -
+win-x64/                                                          -                   -
+win-x86/                                                          -                   -
+SHASUMS256.txt.asc                                 04-Nov-2024 17:40               4.1 KB
+SHASUMS256.txt.sig                                 04-Nov-2024 17:40                310 B
+SHASUMS256.txt                                     04-Nov-2024 17:40               3.6 KB
+node-v4.9.1-darwin-x64.tar.gz                      30-Oct-2024 18:21                10 MB
+node-v4.9.1-darwin-x64.tar.xz                      04-Nov-2024 17:40               7.1 MB
+node-v4.9.1-headers.tar.gz                         04-Nov-2024 17:40               471 KB
+node-v4.9.1-headers.tar.xz                         04-Nov-2024 17:40               342 KB
+node-v4.9.1-linux-arm64.tar.gz                     30-Oct-2024 18:21                12 MB
+node-v4.9.1-linux-arm64.tar.xz                     04-Nov-2024 17:40               7.7 MB
+node-v4.9.1-linux-armv6l.tar.gz                    30-Oct-2024 18:21                11 MB
+node-v4.9.1-linux-armv6l.tar.xz                    04-Nov-2024 17:40               7.3 MB
+node-v4.9.1-linux-armv7l.tar.gz                    30-Oct-2024 18:21                11 MB
+node-v4.9.1-linux-armv7l.tar.xz                    04-Nov-2024 17:40               7.3 MB
+node-v4.9.1-linux-ppc64.tar.gz                     30-Oct-2024 18:21                12 MB
+node-v4.9.1-linux-ppc64.tar.xz                     04-Nov-2024 17:40               7.5 MB
+node-v4.9.1-linux-ppc64le.tar.gz                   30-Oct-2024 18:21                12 MB
+node-v4.9.1-linux-ppc64le.tar.xz                   04-Nov-2024 17:40               7.6 MB
+node-v4.9.1-linux-x64.tar.gz                       30-Oct-2024 18:21                12 MB
+node-v4.9.1-linux-x64.tar.xz                       04-Nov-2024 17:40               8.2 MB
+node-v4.9.1-linux-x86.tar.gz                       30-Oct-2024 18:21                12 MB
+node-v4.9.1-linux-x86.tar.xz                       04-Nov-2024 17:40               7.8 MB
+node-v4.9.1-sunos-x64.tar.gz                       30-Oct-2024 18:21                13 MB
+node-v4.9.1-sunos-x64.tar.xz                       30-Oct-2024 18:21               8.4 MB
+node-v4.9.1-sunos-x86.tar.gz                       30-Oct-2024 18:21                12 MB
+node-v4.9.1-sunos-x86.tar.xz                       04-Nov-2024 17:40               7.7 MB
+node-v4.9.1-win-x64.7z                             04-Nov-2024 17:40               6.1 MB
+node-v4.9.1-win-x64.zip                            30-Oct-2024 18:21                11 MB
+node-v4.9.1-win-x86.7z                             04-Nov-2024 17:40               5.4 MB
+node-v4.9.1-win-x86.zip                            30-Oct-2024 18:21               9.6 MB
+node-v4.9.1-x64.msi                                30-Oct-2024 18:21                11 MB
+node-v4.9.1-x86.msi                                30-Oct-2024 18:21              10.0 MB
+node-v4.9.1.pkg                                    30-Oct-2024 18:21                13 MB
+node-v4.9.1.tar.gz                                 30-Oct-2024 18:21                23 MB
+node-v4.9.1.tar.xz                                 30-Oct-2024 18:21                13 MB
+

+ diff --git a/src/parser/mod.rs b/src/parser/mod.rs index 868b51b..56d2570 100644 --- a/src/parser/mod.rs +++ b/src/parser/mod.rs @@ -211,7 +211,7 @@ fn assert_if_url_has_no_trailing_slash(url: &Url) { ); } -fn get_real_name_from_href(href: &str) -> String { +fn get_last_part_from_href(href: &str) -> &str { // Remove trailing slashes for correct name extraction. let trimmed = href.trim_end_matches('/'); @@ -219,6 +219,12 @@ fn get_real_name_from_href(href: &str) -> String { let last_slash_pos = trimmed.rfind('/').map(|pos| pos + 1).unwrap_or(0); let after_last_slash = &trimmed[last_slash_pos..]; + return after_last_slash; +} + +fn get_real_name_from_href(href: &str) -> String { + let after_last_slash = get_last_part_from_href(href); + // TODO: this might have issues (inconsistent with other impls) // Find the position of the first '?' and take the substring before it. diff --git a/src/parser/nginx.rs b/src/parser/nginx.rs index 79e3d62..c48ea8c 100644 --- a/src/parser/nginx.rs +++ b/src/parser/nginx.rs @@ -1,5 +1,5 @@ /// A parser both suitable for default nginx autoindex and apache f1 format. -use crate::listing::{FileSize, FileType, ListItem}; +use crate::listing::{FileSize, FileType, ListItem, SizeUnit}; use chrono::NaiveDateTime; use scraper::{Html, Selector}; use tracing::debug; @@ -56,7 +56,8 @@ impl Parser for NginxListingParser { get_real_name_from_href(href) } else { // A compromise for apache server (they will NOT url-encode the filename) - href.to_string() + // Just find the last '/' (if exists), and take substring after that + get_last_part_from_href(href).to_string() }; let href = url.join(href)?; @@ -83,26 +84,49 @@ impl Parser for NginxListingParser { .to_string(); let metadata_raw = metadata_raw.trim(); debug!("{:?}", metadata_raw); + // if it's a directory, and metadata are "- -", do some special handling for the date fmt... + // as directory mtime is useless, and nodejs gives us something like this... 
+ let mut skip_date = false; + if type_ == FileType::Directory + && metadata_raw + .chars() + .filter(|c| !c.is_whitespace()) + .collect::<String>() + == "--" + { + skip_date = true; + } // guess date format... - if date_fmt.is_none() { - let (f, r) = guess_date_fmt(metadata_raw); - date_fmt = Some(f); - date_regex = Some(Regex::new(&format!(r"({})\s+([\d\.\-]+ ?[kKMGB]*)$", r))?); - debug!("date_fmt: {:?} date_regex: {:?}", date_fmt, date_regex) + let date; + let size; + if !skip_date { + if date_fmt.is_none() { + let (f, r) = guess_date_fmt(metadata_raw); + date_fmt = Some(f); + date_regex = Some(Regex::new(&format!(r"({})\s+([\d\.\-]+ ?[kKMGB]*)$", r))?); + debug!("date_fmt: {:?} date_regex: {:?}", date_fmt, date_regex) + } + let metadata = + date_regex + .clone() + .unwrap() + .captures(metadata_raw) + .ok_or(anyhow!( + "Get '{}' for {} ({}) metadata, is this a nginx page?", + metadata_raw, + name, + href + ))?; + date = NaiveDateTime::parse_from_str( + metadata.get(1).unwrap().as_str(), + &date_fmt.clone().unwrap(), + )?; + size = metadata.get(2).unwrap().as_str(); + } else { + date = NaiveDateTime::UNIX_EPOCH; + size = "-"; } - let metadata = date_regex - .clone() - .unwrap() - .captures(metadata_raw) - .ok_or(anyhow!( - "Get '{}' for {} ({}) metadata, is this a nginx page?", - metadata_raw, - name, - href - ))?; - let date = metadata.get(1).unwrap().as_str(); - let date = NaiveDateTime::parse_from_str(date, &date_fmt.clone().unwrap())?; - let size = metadata.get(2).unwrap().as_str(); + debug!("{} {} {:?} {} {:?}", href, name, type_, date, size); items.push(ListItem::new( href, @@ -115,9 +139,14 @@ impl Parser for NginxListingParser { || size.contains('K') || size.contains('M') || size.contains('G') + || size.contains('B') { let (n_size, unit) = FileSize::get_humanized(size); - Some(FileSize::HumanizedBinary(n_size, unit)) + if unit != SizeUnit::B { + Some(FileSize::HumanizedBinary(n_size, unit)) + } else { + Some(FileSize::Precise(n_size as u64)) // workaround + } } 
else { let n_size = size.parse::<u64>().unwrap(); Some(FileSize::Precise(n_size)) @@ -322,4 +351,35 @@ mod tests { _ => unreachable!(), } } + + #[test] + fn test_nodejs() { + let context = init_async_context(); + let items = NginxListingParser::default() + .get_list( + &context, + &url::Url::parse("http://localhost:1921/nodejs/v4.9.1/").unwrap(), + ) + .unwrap(); + match items { + ListResult::List(items) => { + assert_eq!(items.len(), 37); + assert_eq!(items[0].name, "docs"); + assert_eq!(items[0].type_, FileType::Directory); + assert_eq!(items[0].size, None); + assert_eq!( + items[0].mtime, + NaiveDateTime::UNIX_EPOCH, // No mtime + ); + assert_eq!(items[3].name, "SHASUMS256.txt.asc"); + assert_eq!(items[3].type_, FileType::File); + assert_eq!(items[3].size, Some(FileSize::HumanizedBinary(4.1, SizeUnit::K))); + assert_eq!( + items[3].mtime, + NaiveDateTime::parse_from_str("2024-11-04 17:40", "%Y-%m-%d %H:%M").unwrap() + ); + } + _ => unreachable!(), + } + } }