diff options
-rw-r--r-- | .gitignore | 3 | ||||
-rw-r--r-- | Cargo.lock | 486 | ||||
-rw-r--r-- | Cargo.toml | 11 | ||||
-rw-r--r-- | src/main.rs | 155 |
4 files changed, 655 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..17e10f4 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +/target +/data +/out diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..5c578d1 --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,486 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "addr2line" +version = "0.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9ecd88a8c8378ca913a680cd98f0f13ac67383d35993f86c90a70e3f137816b" +dependencies = [ + "gimli", +] + +[[package]] +name = "adler" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" + +[[package]] +name = "atty" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" +dependencies = [ + "hermit-abi", + "libc", + "winapi", +] + +[[package]] +name = "backtrace" +version = "0.3.66" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cab84319d616cfb654d03394f38ab7e6f0919e181b1b57e1fd15e7fb4077d9a7" +dependencies = [ + "addr2line", + "cc", + "cfg-if", + "libc", + "miniz_oxide", + "object", + "rustc-demangle", +] + +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + +[[package]] +name = "bzip2" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6afcd980b5f3a45017c57e57a2fcccbb351cc43a356ce117ef760ef8052b89b0" +dependencies = [ + "bzip2-sys", + "libc", +] + +[[package]] +name = "bzip2-sys" +version = "0.1.11+1.0.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"736a955f3fa7875102d57c82b8cac37ec45224a07fd32d58f9f7a186b6cd4cdc" +dependencies = [ + "cc", + "libc", + "pkg-config", +] + +[[package]] +name = "cc" +version = "1.0.73" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2fff2a6927b3bb87f9595d67196a70493f627687a71d87a0d692242c33f58c11" + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "clap" +version = "4.0.0-rc.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7233bf306993c874a6edc363281e83770889877c9d5ee7f656249c65d7e7aa62" +dependencies = [ + "atty", + "bitflags", + "clap_derive", + "clap_lex", + "once_cell", + "strsim", + "termcolor", +] + +[[package]] +name = "clap_derive" +version = "4.0.0-rc.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51eef4d62724bf369e9ca7458cfde0c55263708b4552020058fba384864e8c23" +dependencies = [ + "heck", + "proc-macro-error", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "clap_lex" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0d4198f73e42b4936b35b5bb248d81d2b595ecb170da0bac7655c54eedfa8da8" +dependencies = [ + "os_str_bytes", +] + +[[package]] +name = "encoding_rs" +version = "0.8.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9852635589dc9f9ea1b6fe9f05b50ef208c85c834a562f0c6abb1c475736ec2b" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "failure" +version = "0.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d32e9bd16cc02eae7db7ef620b392808b89f6a5e16bb3497d159c6b92a0f4f86" +dependencies = [ + "backtrace", + "failure_derive", +] + +[[package]] +name = "failure_derive" +version = "0.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"aa4da3c766cd7a0db8242e326e9e4e081edd567072893ed320008189715366a4" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "synstructure", +] + +[[package]] +name = "filetime" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e94a7bbaa59354bc20dd75b67f23e2797b4490e9d6928203fb105c79e448c86c" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", + "windows-sys", +] + +[[package]] +name = "gimli" +version = "0.26.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22030e2c5a68ec659fde1e949a745124b48e6fa8b045b7ed5bd1fe4ccc5c4e5d" + +[[package]] +name = "heck" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2540771e65fc8cb83cd6e8a237f70c319bd5c29f78ed1084ba5d50eeac86f7f9" + +[[package]] +name = "hermit-abi" +version = "0.1.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" +dependencies = [ + "libc", +] + +[[package]] +name = "libc" +version = "0.2.133" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c0f80d65747a3e43d1596c7c5492d95d5edddaabd45a7fcdb02b95f644164966" + +[[package]] +name = "log" +version = "0.4.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "abb12e687cfb44aa40f41fc3978ef76448f9b6038cad6aef4259d3c095a2382e" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "memchr" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" + +[[package]] +name = "miniz_oxide" +version = "0.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96590ba8f175222643a85693f33d26e9c8a015f599c216509b1a6894af675d34" +dependencies = [ + "adler", +] + +[[package]] +name = "object" +version = "0.29.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "21158b2c33aa6d4561f1c0a6ea283ca92bc54802a93b263e910746d679a7eb53" +dependencies = [ + "memchr", +] + +[[package]] +name = "once_cell" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e82dad04139b71a90c080c8463fe0dc7902db5192d939bd0950f074d014339e1" + +[[package]] +name = "os_str_bytes" +version = "6.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ff7415e9ae3fff1225851df9e0d9e4e5479f947619774677a63572e55e80eff" + +[[package]] +name = "parse_mediawiki_dump" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06c580145717dd85cd6ae58860d242364f8b111452a6bae5fd4580dba0cc77e6" +dependencies = [ + "quick-xml", +] + +[[package]] +name = "parse_wiki_text" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dd59a245c58efa02bd73c9462fd9d4c28952d650092c6bcba43b60b4707171dd" + +[[package]] +name = "pkg-config" +version = "0.3.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1df8c4ec4b0627e53bdf214615ad287367e482558cf84b109250b37464dc03ae" + +[[package]] +name = "proc-macro-error" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c" +dependencies = [ + "proc-macro-error-attr", + "proc-macro2", + "quote", + "syn", + "version_check", +] + +[[package]] +name = "proc-macro-error-attr" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869" +dependencies = [ + "proc-macro2", + "quote", + "version_check", +] + +[[package]] +name = "proc-macro2" +version = "1.0.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"7bd7356a8122b6c4a24a82b278680c73357984ca2fc79a0f9fa6dea7dced7c58" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quick-xml" +version = "0.12.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d8065cbb01701c11cc195cde85cbf39d1c6a80705b67a157ebb3042e0e5777f" +dependencies = [ + "encoding_rs", + "failure", + "log", + "memchr", +] + +[[package]] +name = "quote" +version = "1.0.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbe448f377a7d6961e30f5955f9b8d106c3f5e449d493ee1b125c1d43c2b5179" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "redox_syscall" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a" +dependencies = [ + "bitflags", +] + +[[package]] +name = "rustc-demangle" +version = "0.1.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7ef03e0a2b150c7a90d01faf6254c9c48a41e95fb2a8c2ac1c6f0d2b9aefc342" + +[[package]] +name = "strsim" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" + +[[package]] +name = "syn" +version = "1.0.101" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e90cde112c4b9690b8cbe810cba9ddd8bc1d7472e2cae317b69e9438c1cba7d2" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "synstructure" +version = "0.12.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f36bdaa60a83aca3921b5259d5400cbf5e90fc51931376a9bd4a0eb79aa7210f" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "unicode-xid", +] + +[[package]] +name = "tar" +version = "0.4.38" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b55807c0344e1e6c04d7c965f5289c39a8d94ae23ed5c0b57aabac549f871c6" 
+dependencies = [ + "filetime", + "libc", + "xattr", +] + +[[package]] +name = "termcolor" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bab24d30b911b2376f3a13cc2cd443142f0c81dda04c118693e35b3835757755" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "unicode-ident" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dcc811dc4066ac62f84f11307873c4850cb653bfa9b1719cee2bd2204a4bc5dd" + +[[package]] +name = "unicode-xid" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f962df74c8c05a667b5ee8bcf162993134c104e96440b663c8daa176dc772d8c" + +[[package]] +name = "version_check" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" + +[[package]] +name = "wikiviewer" +version = "0.1.0" +dependencies = [ + "bzip2", + "clap", + "parse_mediawiki_dump", + "parse_wiki_text", + "tar", +] + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-util" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178" +dependencies = [ + "winapi", +] + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "windows-sys" +version = "0.36.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea04155a16a59f9eab786fe12a4a450e75cdb175f9e0d80da1e17db09f55b8d2" +dependencies = [ + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_msvc" +version = "0.36.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9bb8c3fd39ade2d67e9874ac4f3db21f0d710bee00fe7cab16949ec184eeaa47" + +[[package]] +name = "windows_i686_gnu" +version = "0.36.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "180e6ccf01daf4c426b846dfc66db1fc518f074baa793aa7d9b9aaeffad6a3b6" + +[[package]] +name = "windows_i686_msvc" +version = "0.36.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2e7917148b2812d1eeafaeb22a97e4813dfa60a3f8f78ebe204bcc88f12f024" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.36.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4dcd171b8776c41b97521e5da127a2d86ad280114807d0b2ab1e462bc764d9e1" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.36.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c811ca4a8c853ef420abd8592ba53ddbbac90410fab6903b3e79972a631f7680" + +[[package]] +name = "xattr" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d1526bbe5aaeb5eb06885f4d987bcdfa5e23187055de9b83fe00156a821fabc" +dependencies = [ + "libc", +] diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..9e50d9f --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,11 @@ +[package] +name = "wikiviewer" +version = "0.1.0" +edition = "2021" + +[dependencies] +parse_mediawiki_dump = "0.1.0" +parse_wiki_text = "0.1.5" +bzip2 = "0.4.3" +tar = "0.4.38" +clap = { 
version = "4.0.0-rc.3", features = ["derive"] } diff --git a/src/main.rs b/src/main.rs new file mode 100644 index 0000000..e973048 --- /dev/null +++ b/src/main.rs @@ -0,0 +1,155 @@ +use clap::Parser; +use parse_wiki_text::Node; +use std::fs::File; +use std::io::{stdin, stdout, Read, Write}; +use tar::Header; + +extern crate bzip2; +extern crate parse_mediawiki_dump; + +#[derive(Parser)] +struct Args { + /// Stop after n articles (for debugging) + #[arg(short, long)] + limit: Option<usize>, + /// Read bzip2 compressed input + #[arg(short, long)] + bzip2: bool, + /// Stream tar to stdout + #[arg(short, long)] + tar: bool, + /// Show non-fatal warnings + #[arg(short, long)] + verbose: bool, +} + +fn main() { + let args = Args::parse(); + + let mut input: Box<dyn Read> = Box::new(stdin()); + if args.bzip2 { + input = Box::new(bzip2::read::BzDecoder::new(input)) + } + + let input = std::io::BufReader::new(input); + let mut archive = tar::Builder::new(stdout()); + + for (i, result) in parse_mediawiki_dump::parse(input).enumerate() { + match result { + Err(error) => { + eprintln!("xml format error: {}", error); + break; + } + Ok(page) => { + if page.namespace == 0 + && match &page.format { + None => false, + Some(format) => format == "text/x-wiki", + } + && match &page.model { + None => false, + Some(model) => model == "wikitext", + } + { + let filename = page.title.replace("/", "_"); + + let ast = parse_wiki_text::Configuration::default().parse(&page.text); + if args.verbose { + for w in ast.warnings { + eprintln!("wikitext warning: {}", w.message.message()) + } + } + + let html = format!( + "<!DOCTYPE html><html><head><title>{}</title></head><body>{}</body></html>", + page.title, + render_nodes_to_string(ast.nodes) + ); + + if args.tar { + let mut header = Header::new_gnu(); + header.set_size(html.as_bytes().len() as u64); + header.set_cksum(); + archive + .append_data(&mut header, filename, html.as_bytes()) + .unwrap(); + } else { + let mut f = 
File::create(format!("out/{}", filename)).unwrap(); + f.write_all(html.as_bytes()).unwrap() + } + } else { + eprintln!("page ignored: {:?}", page.title); + } + } + } + if Some(i) == args.limit { + break; + } + } + archive.finish().unwrap(); +} + +pub fn escape(text: &str) -> String { + text.replace("&", "&amp;") + .replace("<", "&lt;") + .replace(">", "&gt;") + .replace("'", "’") + .replace("\"", "&quot;") +} + +fn render_nodes(html: &mut String, nodes: Vec<Node>) { + for n in nodes { + render_node(html, n) + } +} +fn render_nodes_to_string(nodes: Vec<Node>) -> String { + let mut html = String::new(); + render_nodes(&mut html, nodes); + return html; +} +fn render_node(html: &mut String, n: Node) { + use std::fmt::Write; + match n { + parse_wiki_text::Node::Bold { .. } => (), + parse_wiki_text::Node::BoldItalic { .. } => (), + parse_wiki_text::Node::Category { + ordinal, target, .. + } => write!(html, "[todo]").unwrap(), + parse_wiki_text::Node::CharacterEntity { character, .. } => write!(html, "[todo]").unwrap(), + parse_wiki_text::Node::Comment { .. } => (), + parse_wiki_text::Node::DefinitionList { items, .. } => write!(html, "[todo]").unwrap(), + parse_wiki_text::Node::EndTag { name, .. } => write!(html, "[todo]").unwrap(), + parse_wiki_text::Node::ExternalLink { nodes, .. } => write!(html, "[todo]").unwrap(), + parse_wiki_text::Node::Heading { level, nodes, .. } => write!( + html, + "<h{level}>{}</h{level}>", + render_nodes_to_string(nodes) + ) + .unwrap(), + parse_wiki_text::Node::HorizontalDivider { .. } => write!(html, "<hr>").unwrap(), + parse_wiki_text::Node::Image { target, text, .. } => write!(html, "[todo: image]").unwrap(), + parse_wiki_text::Node::Italic { .. } => (), + parse_wiki_text::Node::Link { target, text, .. } => write!(html, "[todo]").unwrap(), + parse_wiki_text::Node::MagicWord { .. } => write!(html, "[todo]").unwrap(), + parse_wiki_text::Node::OrderedList { items, .. } => write!(html, "[todo]").unwrap(), + parse_wiki_text::Node::ParagraphBreak { .. 
} => write!(html, "[todo]").unwrap(), + parse_wiki_text::Node::Parameter { default, name, .. } => write!(html, "[todo]").unwrap(), + parse_wiki_text::Node::Preformatted { nodes, .. } => { + write!(html, "<pre>{}</pre>", render_nodes_to_string(nodes)).unwrap() + } + parse_wiki_text::Node::Redirect { target, .. } => write!(html, "[todo]").unwrap(), + parse_wiki_text::Node::StartTag { name, .. } => write!(html, "[todo]").unwrap(), + parse_wiki_text::Node::Table { + attributes, + captions, + rows, + .. + } => write!(html, "[todo]").unwrap(), + parse_wiki_text::Node::Tag { name, nodes, .. } => write!(html, "[todo]").unwrap(), + parse_wiki_text::Node::Template { + name, parameters, .. + } => write!(html, "[todo]").unwrap(), + parse_wiki_text::Node::Text { value, .. } => write!(html, "{}", escape(value)).unwrap(), + parse_wiki_text::Node::UnorderedList { items, .. } => write!(html, "[todo]").unwrap(), + } +} |