use clap::Parser; use parse_wiki_text::{Node, Parameter}; use std::fs::File; use std::io::{stdin, stdout, Read, Write}; use tar::Header; extern crate bzip2; extern crate parse_mediawiki_dump; #[derive(Parser)] struct Args { /// Stop after n articles (for debugging) #[arg(short, long)] limit: Option, /// Read bzip2 compressed input #[arg(short, long)] bzip2: bool, /// Stream tar to stdout #[arg(short, long)] tar: bool, /// Show non-fatal warnings #[arg(short, long)] verbose: bool, } fn main() { let args = Args::parse(); let mut input: Box = Box::new(stdin()); if args.bzip2 { input = Box::new(bzip2::read::BzDecoder::new(input)) } let input = std::io::BufReader::new(input); let mut archive = tar::Builder::new(stdout()); for (i, result) in parse_mediawiki_dump::parse(input).enumerate() { match result { Err(error) => { eprintln!("xml format error: {}", error); break; } Ok(page) => { if page.namespace == 0 && match &page.format { None => false, Some(format) => format == "text/x-wiki", } && match &page.model { None => false, Some(model) => model == "wikitext", } { let filename = title_to_filename(&page.title); let ast = parse_wiki_text::Configuration::default().parse(&page.text); if args.verbose { for w in ast.warnings { eprintln!("wikitext warning: {}", w.message.message()) } } let mut refs = vec![]; let mut inner = render_nodes_to_string(&ast.nodes, &mut refs); render_refs(&mut inner, &refs); let html = format!( "{}{}", page.title, inner, ); if args.tar { let mut header = Header::new_gnu(); header.set_size(html.as_bytes().len() as u64); header.set_cksum(); archive .append_data(&mut header, filename, html.as_bytes()) .unwrap(); } else { let mut f = File::create(format!("out/{}", filename)).unwrap(); f.write_all(html.as_bytes()).unwrap() } } else { eprintln!("page ignored: {:?}", page.title); } } } if Some(i) == args.limit { break; } } archive.finish().unwrap(); } fn title_to_filename(t: &str) -> String { t.replace("/", "_") } pub fn escape(text: &str) -> String { text.replace("&", "&") .replace("<", "<") .replace(">", ">") .replace("'", "’") .replace("\"", """) } fn render_nodes(html: &mut String, refs: &mut Vec, nodes: &Vec) { for n in nodes { render_node(html, refs, n) } } fn render_nodes_to_string(nodes: &Vec, refs: &mut Vec) -> String { let mut html = String::new(); render_nodes(&mut html, refs, nodes); return html; } fn render_node(html: &mut String, refs: &mut Vec, n: &Node) { use std::fmt::Write; match n { parse_wiki_text::Node::Bold { .. } => (), parse_wiki_text::Node::BoldItalic { .. } => (), parse_wiki_text::Node::Category { ordinal, target, .. } => write!(html, "[todo]").unwrap(), parse_wiki_text::Node::CharacterEntity { character, .. } => { write!(html, "[todo: character]").unwrap() } parse_wiki_text::Node::Comment { .. } => (), parse_wiki_text::Node::DefinitionList { items, .. } => { write!(html, "[todo: def list]").unwrap() } parse_wiki_text::Node::EndTag { name, .. } => write!(html, "[todo: tag end]").unwrap(), parse_wiki_text::Node::ExternalLink { nodes, .. } => { write!(html, "[todo: external link]").unwrap() } parse_wiki_text::Node::Heading { level, nodes, .. } => write!( html, "{}", render_nodes_to_string(nodes, refs) ) .unwrap(), parse_wiki_text::Node::HorizontalDivider { .. } => write!(html, "
").unwrap(), parse_wiki_text::Node::Image { target, text, .. } => write!(html, "[todo: image]").unwrap(), parse_wiki_text::Node::Italic { .. } => (), parse_wiki_text::Node::Link { target, text, .. } => write!( html, "{}", title_to_filename(target), // TODO does this always link to wikipedia? render_nodes_to_string(text, refs) ) .unwrap(), parse_wiki_text::Node::MagicWord { .. } => write!(html, "[todo: magic]").unwrap(), parse_wiki_text::Node::OrderedList { items, .. } => write!( html, "
    {}
", items .iter() .map(|e| format!("
  • {}
  • ", render_nodes_to_string(&e.nodes, refs))) .collect::>() .join("") ) .unwrap(), parse_wiki_text::Node::UnorderedList { items, .. } => write!( html, "
      {}
    ", items .iter() .map(|e| format!("
  • {}
  • ", render_nodes_to_string(&e.nodes, refs))) .collect::>() .join("") ) .unwrap(), parse_wiki_text::Node::ParagraphBreak { .. } => write!(html, "

    ").unwrap(), parse_wiki_text::Node::Parameter { default, name, .. } => { write!(html, "[todo: parameter]").unwrap() } parse_wiki_text::Node::Preformatted { nodes, .. } => { write!(html, "

    {}
    ", render_nodes_to_string(nodes, refs)).unwrap() } parse_wiki_text::Node::Redirect { target, .. } => write!( html, "Redirect: {}", title_to_filename(target), title_to_filename(target) ) .unwrap(), parse_wiki_text::Node::StartTag { name, .. } => write!(html, "[todo: start tag]").unwrap(), parse_wiki_text::Node::Table { attributes, captions, rows, .. } => write!(html, "[todo: table]").unwrap(), parse_wiki_text::Node::Tag { name, nodes, .. } => match name.as_ref() { "ref" => { if !nodes.is_empty() { let r = render_nodes_to_string(nodes, refs); refs.push(r); let refid = refs.len(); write!(html, "[{}]", refid, refid).unwrap(); } } _ => write!(html, "[todo: tag {name:?} template]").unwrap(), }, parse_wiki_text::Node::Template { name, parameters, .. } => { let name = match name.first() { Some(Node::Text { value, .. }) => value, _ => panic!("no"), }; render_template(html, refs, name, parameters) } parse_wiki_text::Node::Text { value, .. } => write!(html, "{}", escape(value)).unwrap(), } } pub fn render_template( html: &mut String, refs: &mut Vec, name: &str, params: &Vec, ) { use std::fmt::Write; match name { "lang" => write!(html, "{}", render_nodes_to_string(¶ms[1].value, refs)).unwrap(), "IPA" => write!( html, "{}", render_nodes_to_string(¶ms[0].value, refs) ) .unwrap(), _ => { write!(html, "[todo: {name:?} template]").unwrap(); // eprintln!("unsupported template {name:?}"); // eprintln!("{params:?}"); } } } fn render_refs(html: &mut String, refs: &Vec) { use std::fmt::Write; write!(html, "
      ").unwrap(); for (i, r) in refs.iter().enumerate() { write!(html, "
    1. {r}
    2. ", i + 1).unwrap() } write!(html, "
    ").unwrap(); }