use clap::Parser;
use parse_wiki_text::{Node, Parameter};
use std::fs::File;
use std::io::{stdin, stdout, Read, Write};
use tar::Header;
extern crate bzip2;
extern crate parse_mediawiki_dump;
/// Command-line options for the dump-to-HTML converter.
/// (Doc comments on fields double as clap `--help` text.)
#[derive(Parser)]
struct Args {
    /// Stop after n articles (for debugging)
    #[arg(short, long)]
    limit: Option<usize>,
    /// Read bzip2 compressed input
    #[arg(short, long)]
    bzip2: bool,
    /// Stream tar to stdout
    #[arg(short, long)]
    tar: bool,
    /// Show non-fatal warnings
    #[arg(short, long)]
    verbose: bool,
}
fn main() {
let args = Args::parse();
let mut input: Box = Box::new(stdin());
if args.bzip2 {
input = Box::new(bzip2::read::BzDecoder::new(input))
}
let input = std::io::BufReader::new(input);
let mut archive = tar::Builder::new(stdout());
for (i, result) in parse_mediawiki_dump::parse(input).enumerate() {
match result {
Err(error) => {
eprintln!("xml format error: {}", error);
break;
}
Ok(page) => {
if page.namespace == 0
&& match &page.format {
None => false,
Some(format) => format == "text/x-wiki",
}
&& match &page.model {
None => false,
Some(model) => model == "wikitext",
}
{
let filename = title_to_filename(&page.title);
let ast = parse_wiki_text::Configuration::default().parse(&page.text);
if args.verbose {
for w in ast.warnings {
eprintln!("wikitext warning: {}", w.message.message())
}
}
let mut refs = vec![];
let mut inner = render_nodes_to_string(&ast.nodes, &mut refs);
render_refs(&mut inner, &refs);
let html = format!(
"{}{}",
page.title, inner,
);
if args.tar {
let mut header = Header::new_gnu();
header.set_size(html.as_bytes().len() as u64);
header.set_cksum();
archive
.append_data(&mut header, filename, html.as_bytes())
.unwrap();
} else {
let mut f = File::create(format!("out/{}", filename)).unwrap();
f.write_all(html.as_bytes()).unwrap()
}
} else {
eprintln!("page ignored: {:?}", page.title);
}
}
}
if Some(i) == args.limit {
break;
}
}
archive.finish().unwrap();
}
/// Maps a page title to a safe output filename by turning path
/// separators into underscores (titles like "A/B" would otherwise
/// escape the output directory).
fn title_to_filename(t: &str) -> String {
    t.chars().map(|c| if c == '/' { '_' } else { c }).collect()
}
/// HTML-escapes `text` for safe embedding in element content and
/// double-quoted attribute values.
///
/// The ampersand must be rewritten first so the entities introduced by
/// the later replacements are not themselves double-escaped.
/// (NOTE(review): the pre-mangled source mapped `'` to a typographic
/// right single quote; `&#39;` is used here as the safe standard form.)
pub fn escape(text: &str) -> String {
    text.replace('&', "&amp;")
        .replace('<', "&lt;")
        .replace('>', "&gt;")
        .replace('\'', "&#39;")
        .replace('"', "&quot;")
}
fn render_nodes(html: &mut String, refs: &mut Vec, nodes: &Vec) {
for n in nodes {
render_node(html, refs, n)
}
}
fn render_nodes_to_string(nodes: &Vec, refs: &mut Vec) -> String {
let mut html = String::new();
render_nodes(&mut html, refs, nodes);
return html;
}
fn render_node(html: &mut String, refs: &mut Vec, n: &Node) {
use std::fmt::Write;
match n {
parse_wiki_text::Node::Bold { .. } => (),
parse_wiki_text::Node::BoldItalic { .. } => (),
parse_wiki_text::Node::Category {
ordinal, target, ..
} => write!(html, "[todo]").unwrap(),
parse_wiki_text::Node::CharacterEntity { character, .. } => {
write!(html, "[todo: character]").unwrap()
}
parse_wiki_text::Node::Comment { .. } => (),
parse_wiki_text::Node::DefinitionList { items, .. } => {
write!(html, "[todo: def list]").unwrap()
}
parse_wiki_text::Node::EndTag { name, .. } => write!(html, "[todo: tag end]").unwrap(),
parse_wiki_text::Node::ExternalLink { nodes, .. } => {
write!(html, "[todo: external link]").unwrap()
}
parse_wiki_text::Node::Heading { level, nodes, .. } => write!(
html,
"{}",
render_nodes_to_string(nodes, refs)
)
.unwrap(),
parse_wiki_text::Node::HorizontalDivider { .. } => write!(html, "
").unwrap(),
parse_wiki_text::Node::Image { target, text, .. } => write!(html, "[todo: image]").unwrap(),
parse_wiki_text::Node::Italic { .. } => (),
parse_wiki_text::Node::Link { target, text, .. } => write!(
html,
"{}",
title_to_filename(target), // TODO does this always link to wikipedia?
render_nodes_to_string(text, refs)
)
.unwrap(),
parse_wiki_text::Node::MagicWord { .. } => write!(html, "[todo: magic]").unwrap(),
parse_wiki_text::Node::OrderedList { items, .. } => write!(
html,
"{}
",
items
.iter()
.map(|e| format!("{}", render_nodes_to_string(&e.nodes, refs)))
.collect::>()
.join("")
)
.unwrap(),
parse_wiki_text::Node::UnorderedList { items, .. } => write!(
html,
"",
items
.iter()
.map(|e| format!("{}", render_nodes_to_string(&e.nodes, refs)))
.collect::>()
.join("")
)
.unwrap(),
parse_wiki_text::Node::ParagraphBreak { .. } => write!(html, "
").unwrap(),
parse_wiki_text::Node::Parameter { default, name, .. } => {
write!(html, "[todo: parameter]").unwrap()
}
parse_wiki_text::Node::Preformatted { nodes, .. } => {
write!(html, "
{}
", render_nodes_to_string(nodes, refs)).unwrap()
}
parse_wiki_text::Node::Redirect { target, .. } => write!(
html,
"Redirect: {}",
title_to_filename(target),
title_to_filename(target)
)
.unwrap(),
parse_wiki_text::Node::StartTag { name, .. } => write!(html, "[todo: start tag]").unwrap(),
parse_wiki_text::Node::Table {
attributes,
captions,
rows,
..
} => write!(html, "[todo: table]").unwrap(),
parse_wiki_text::Node::Tag { name, nodes, .. } => match name.as_ref() {
"ref" => {
if !nodes.is_empty() {
let r = render_nodes_to_string(nodes, refs);
refs.push(r);
let refid = refs.len();
write!(html, "[{}]", refid, refid).unwrap();
}
}
_ => write!(html, "[todo: tag {name:?} template]").unwrap(),
},
parse_wiki_text::Node::Template {
name, parameters, ..
} => {
let name = match name.first() {
Some(Node::Text { value, .. }) => value,
_ => panic!("no"),
};
render_template(html, refs, name, parameters)
}
parse_wiki_text::Node::Text { value, .. } => write!(html, "{}", escape(value)).unwrap(),
}
}
pub fn render_template(
html: &mut String,
refs: &mut Vec,
name: &str,
params: &Vec,
) {
use std::fmt::Write;
match name {
"lang" => write!(html, "{}", render_nodes_to_string(¶ms[1].value, refs)).unwrap(),
"IPA" => write!(
html,
"{}
",
render_nodes_to_string(¶ms[0].value, refs)
)
.unwrap(),
_ => {
write!(html, "[todo: {name:?} template]").unwrap();
// eprintln!("unsupported template {name:?}");
// eprintln!("{params:?}");
}
}
}
/// Appends the collected footnotes as an ordered list. Each entry gets an
/// `id="ref{n}"` anchor (1-based) matching the inline `#ref{n}` links
/// emitted for `<ref>` tags. Nothing is written when there are no refs.
fn render_refs(html: &mut String, refs: &Vec<String>) {
    use std::fmt::Write;
    if refs.is_empty() {
        return;
    }
    writeln!(html, "<ol>").unwrap();
    for (i, r) in refs.iter().enumerate() {
        writeln!(html, "<li id=\"ref{}\">{}</li>", i + 1, r).unwrap()
    }
    writeln!(html, "</ol>").unwrap();
}