diff options
Diffstat (limited to 'src/main.rs')
-rw-r--r-- | src/main.rs | 155 |
1 files changed, 155 insertions, 0 deletions
diff --git a/src/main.rs b/src/main.rs new file mode 100644 index 0000000..e973048 --- /dev/null +++ b/src/main.rs @@ -0,0 +1,155 @@ +use clap::Parser; +use parse_wiki_text::Node; +use std::fs::File; +use std::io::{stdin, stdout, Read, Write}; +use tar::Header; + +extern crate bzip2; +extern crate parse_mediawiki_dump; + +#[derive(Parser)] +struct Args { + /// Stop after n articles (for debugging) + #[arg(short, long)] + limit: Option<usize>, + /// Read bzip2 compressed input + #[arg(short, long)] + bzip2: bool, + /// Stream tar to stdout + #[arg(short, long)] + tar: bool, + /// Show non-fatal warnings + #[arg(short, long)] + verbose: bool, +} + +fn main() { + let args = Args::parse(); + + let mut input: Box<dyn Read> = Box::new(stdin()); + if args.bzip2 { + input = Box::new(bzip2::read::BzDecoder::new(input)) + } + + let input = std::io::BufReader::new(input); + let mut archive = tar::Builder::new(stdout()); + + for (i, result) in parse_mediawiki_dump::parse(input).enumerate() { + match result { + Err(error) => { + eprintln!("xml format error: {}", error); + break; + } + Ok(page) => { + if page.namespace == 0 + && match &page.format { + None => false, + Some(format) => format == "text/x-wiki", + } + && match &page.model { + None => false, + Some(model) => model == "wikitext", + } + { + let filename = page.title.replace("/", "_"); + + let ast = parse_wiki_text::Configuration::default().parse(&page.text); + if args.verbose { + for w in ast.warnings { + eprintln!("wikitext warning: {}", w.message.message()) + } + } + + let html = format!( + "<!DOCTYPE html><html><head><title>{}</title></head><body>{}</body></html>", + page.title, + render_nodes_to_string(ast.nodes) + ); + + if args.tar { + let mut header = Header::new_gnu(); + header.set_size(html.as_bytes().len() as u64); + header.set_cksum(); + archive + .append_data(&mut header, filename, html.as_bytes()) + .unwrap(); + } else { + let mut f = File::create(format!("out/{}", filename)).unwrap(); + f.write_all(html.as_bytes()).unwrap() + } + } else { + eprintln!("page ignored: {:?}", page.title); + } + } + } + if Some(i) == args.limit { + break; + } + } + archive.finish().unwrap(); +} + +pub fn escape(text: &str) -> String { + text.replace("&", "&") + .replace("<", "<") + .replace(">", ">") + .replace("'", "’") + .replace("\"", """) +} + +fn render_nodes(html: &mut String, nodes: Vec<Node>) { + for n in nodes { + render_node(html, n) + } +} +fn render_nodes_to_string(nodes: Vec<Node>) -> String { + let mut html = String::new(); + render_nodes(&mut html, nodes); + return html; +} +fn render_node(html: &mut String, n: Node) { + use std::fmt::Write; + match n { + parse_wiki_text::Node::Bold { .. } => (), + parse_wiki_text::Node::BoldItalic { .. } => (), + parse_wiki_text::Node::Category { + ordinal, target, .. + } => write!(html, "[todo]").unwrap(), + parse_wiki_text::Node::CharacterEntity { character, .. } => write!(html, "[todo]").unwrap(), + parse_wiki_text::Node::Comment { .. } => (), + parse_wiki_text::Node::DefinitionList { items, .. } => write!(html, "[todo]").unwrap(), + parse_wiki_text::Node::EndTag { name, .. } => write!(html, "[todo]").unwrap(), + parse_wiki_text::Node::ExternalLink { nodes, .. } => write!(html, "[todo]").unwrap(), + parse_wiki_text::Node::Heading { level, nodes, .. } => write!( + html, + "<h{level}>{}</h{level}>", + render_nodes_to_string(nodes) + ) + .unwrap(), + parse_wiki_text::Node::HorizontalDivider { .. } => write!(html, "<hr>").unwrap(), + parse_wiki_text::Node::Image { target, text, .. } => write!(html, "[todo: image]").unwrap(), + parse_wiki_text::Node::Italic { .. } => (), + parse_wiki_text::Node::Link { target, text, .. } => write!(html, "[todo]").unwrap(), + parse_wiki_text::Node::MagicWord { .. } => write!(html, "[todo]").unwrap(), + parse_wiki_text::Node::OrderedList { items, .. } => write!(html, "[todo]").unwrap(), + parse_wiki_text::Node::ParagraphBreak { .. } => write!(html, "[todo]").unwrap(), + parse_wiki_text::Node::Parameter { default, name, .. } => write!(html, "[todo]").unwrap(), + parse_wiki_text::Node::Preformatted { nodes, .. } => { + write!(html, "<pre>{}</pre>", render_nodes_to_string(nodes)).unwrap() + } + parse_wiki_text::Node::Redirect { target, .. } => write!(html, "[todo]").unwrap(), + parse_wiki_text::Node::StartTag { name, .. } => write!(html, "[todo]").unwrap(), + parse_wiki_text::Node::Table { + attributes, + captions, + rows, + .. + } => write!(html, "[todo]").unwrap(), + parse_wiki_text::Node::Tag { name, nodes, .. } => write!(html, "[todo]").unwrap(), + parse_wiki_text::Node::Template { + name, parameters, .. + } => write!(html, "[todo]").unwrap(), + parse_wiki_text::Node::Text { value, .. } => write!(html, "{}", escape(value)).unwrap(), + parse_wiki_text::Node::UnorderedList { items, .. } => write!(html, "[todo]").unwrap(), + } +} |