use clap::Parser; use parse_wiki_text::Node; use std::fs::File; use std::io::{stdin, stdout, Read, Write}; use tar::Header; extern crate bzip2; extern crate parse_mediawiki_dump; #[derive(Parser)] struct Args { /// Stop after n articles (for debugging) #[arg(short, long)] limit: Option, /// Read bzip2 compressed input #[arg(short, long)] bzip2: bool, /// Stream tar to stdout #[arg(short, long)] tar: bool, /// Show non-fatal warnings #[arg(short, long)] verbose: bool, } fn main() { let args = Args::parse(); let mut input: Box = Box::new(stdin()); if args.bzip2 { input = Box::new(bzip2::read::BzDecoder::new(input)) } let input = std::io::BufReader::new(input); let mut archive = tar::Builder::new(stdout()); for (i, result) in parse_mediawiki_dump::parse(input).enumerate() { match result { Err(error) => { eprintln!("xml format error: {}", error); break; } Ok(page) => { if page.namespace == 0 && match &page.format { None => false, Some(format) => format == "text/x-wiki", } && match &page.model { None => false, Some(model) => model == "wikitext", } { let filename = page.title.replace("/", "_"); let ast = parse_wiki_text::Configuration::default().parse(&page.text); if args.verbose { for w in ast.warnings { eprintln!("wikitext warning: {}", w.message.message()) } } let html = format!( "{}{}", page.title, render_nodes_to_string(ast.nodes) ); if args.tar { let mut header = Header::new_gnu(); header.set_size(html.as_bytes().len() as u64); header.set_cksum(); archive .append_data(&mut header, filename, html.as_bytes()) .unwrap(); } else { let mut f = File::create(format!("out/{}", filename)).unwrap(); f.write_all(html.as_bytes()).unwrap() } } else { eprintln!("page ignored: {:?}", page.title); } } } if Some(i) == args.limit { break; } } archive.finish().unwrap(); } pub fn escape(text: &str) -> String { text.replace("&", "&") .replace("<", "<") .replace(">", ">") .replace("'", "’") .replace("\"", """) } fn render_nodes(html: &mut String, nodes: Vec) { for n in nodes { render_node(html, n) } } fn render_nodes_to_string(nodes: Vec) -> String { let mut html = String::new(); render_nodes(&mut html, nodes); return html; } fn render_node(html: &mut String, n: Node) { use std::fmt::Write; match n { parse_wiki_text::Node::Bold { .. } => (), parse_wiki_text::Node::BoldItalic { .. } => (), parse_wiki_text::Node::Category { ordinal, target, .. } => write!(html, "[todo]").unwrap(), parse_wiki_text::Node::CharacterEntity { character, .. } => write!(html, "[todo]").unwrap(), parse_wiki_text::Node::Comment { .. } => (), parse_wiki_text::Node::DefinitionList { items, .. } => write!(html, "[todo]").unwrap(), parse_wiki_text::Node::EndTag { name, .. } => write!(html, "[todo]").unwrap(), parse_wiki_text::Node::ExternalLink { nodes, .. } => write!(html, "[todo]").unwrap(), parse_wiki_text::Node::Heading { level, nodes, .. } => write!( html, "{}", render_nodes_to_string(nodes) ) .unwrap(), parse_wiki_text::Node::HorizontalDivider { .. } => write!(html, "
").unwrap(), parse_wiki_text::Node::Image { target, text, .. } => write!(html, "[todo: image]").unwrap(), parse_wiki_text::Node::Italic { .. } => (), parse_wiki_text::Node::Link { target, text, .. } => write!(html, "[todo]").unwrap(), parse_wiki_text::Node::MagicWord { .. } => write!(html, "[todo]").unwrap(), parse_wiki_text::Node::OrderedList { items, .. } => write!(html, "[todo]").unwrap(), parse_wiki_text::Node::ParagraphBreak { .. } => write!(html, "[todo]").unwrap(), parse_wiki_text::Node::Parameter { default, name, .. } => write!(html, "[todo]").unwrap(), parse_wiki_text::Node::Preformatted { nodes, .. } => { write!(html, "
{}
", render_nodes_to_string(nodes)).unwrap() } parse_wiki_text::Node::Redirect { target, .. } => write!(html, "[todo]").unwrap(), parse_wiki_text::Node::StartTag { name, .. } => write!(html, "[todo]").unwrap(), parse_wiki_text::Node::Table { attributes, captions, rows, .. } => write!(html, "[todo]").unwrap(), parse_wiki_text::Node::Tag { name, nodes, .. } => write!(html, "[todo]").unwrap(), parse_wiki_text::Node::Template { name, parameters, .. } => write!(html, "[todo]").unwrap(), parse_wiki_text::Node::Text { value, .. } => write!(html, "{}", escape(value)).unwrap(), parse_wiki_text::Node::UnorderedList { items, .. } => write!(html, "[todo]").unwrap(), } }