about | summary | refs | log | tree | commit | diff
path: root/src/main.rs
diff options
context:
space:
mode:
Diffstat (limited to 'src/main.rs')
-rw-r--r-- src/main.rs 155
1 files changed, 155 insertions, 0 deletions
diff --git a/src/main.rs b/src/main.rs
new file mode 100644
index 0000000..e973048
--- /dev/null
+++ b/src/main.rs
@@ -0,0 +1,155 @@
+use clap::Parser;
+use parse_wiki_text::Node;
+use std::fs::File;
+use std::io::{stdin, stdout, Read, Write};
+use tar::Header;
+
+extern crate bzip2;
+extern crate parse_mediawiki_dump;
+
+#[derive(Parser)] // clap derive; provides the `Args::parse()` call used in `main`
+struct Args { // command-line options; NOTE(review): the `///` lines are doc attributes the derive presumably turns into --help text, so treat them as user-facing strings
+ /// Stop after n articles (for debugging)
+ #[arg(short, long)]
+ limit: Option<usize>, // optional cap checked in the page loop of `main`; `None` means no cap
+ /// Read bzip2 compressed input
+ #[arg(short, long)]
+ bzip2: bool, // when set, `main` wraps stdin in a `bzip2::read::BzDecoder`
+ /// Stream tar to stdout
+ #[arg(short, long)]
+ tar: bool, // selects tar output on stdout instead of one file per page under `out/`
+ /// Show non-fatal warnings
+ #[arg(short, long)]
+ verbose: bool, // when set, `main` prints wikitext parser warnings to stderr
+}
+
+/// Converts a MediaWiki XML dump read from stdin into one HTML document per page.
+///
+/// Only pages in namespace 0 whose format is `text/x-wiki` and whose model is
+/// `wikitext` are converted; every other page is reported on stderr and skipped.
+/// With `--tar` the documents are streamed to stdout as a tar archive, otherwise
+/// each one is written to `out/<title>` (slashes in the title become underscores).
+/// An XML error is reported on stderr and ends the conversion.
+///
+/// # Panics
+///
+/// Panics if the `out` directory, an output file, or the tar stream cannot be
+/// written.
+fn main() {
+    let args = Args::parse();
+
+    let input: Box<dyn Read> = if args.bzip2 {
+        Box::new(bzip2::read::BzDecoder::new(stdin()))
+    } else {
+        Box::new(stdin())
+    };
+    let input = std::io::BufReader::new(input);
+
+    // Only open a tar stream when asked to: finishing a builder writes the
+    // end-of-archive blocks, which must not end up on stdout in file mode.
+    let mut archive = if args.tar {
+        Some(tar::Builder::new(stdout()))
+    } else {
+        std::fs::create_dir_all("out").expect("cannot create output directory `out`");
+        None
+    };
+
+    // `take` makes `--limit n` stop after exactly n dump pages (ignored pages
+    // count too, as before); `--limit 0` therefore processes nothing.
+    let pages = parse_mediawiki_dump::parse(input).take(args.limit.unwrap_or(usize::MAX));
+    for result in pages {
+        let page = match result {
+            Ok(page) => page,
+            Err(error) => {
+                eprintln!("xml format error: {}", error);
+                break;
+            }
+        };
+
+        let is_article = page.namespace == 0
+            && page.format.as_deref() == Some("text/x-wiki")
+            && page.model.as_deref() == Some("wikitext");
+        if !is_article {
+            eprintln!("page ignored: {:?}", page.title);
+            continue;
+        }
+
+        let filename = page.title.replace('/', "_");
+
+        let ast = parse_wiki_text::Configuration::default().parse(&page.text);
+        if args.verbose {
+            for w in ast.warnings {
+                eprintln!("wikitext warning: {}", w.message.message())
+            }
+        }
+
+        // The title comes straight from the dump, so escape it like body text.
+        let html = format!(
+            "<!DOCTYPE html><html><head><title>{}</title></head><body>{}</body></html>",
+            escape(&page.title),
+            render_nodes_to_string(ast.nodes)
+        );
+
+        match archive.as_mut() {
+            Some(archive) => {
+                let mut header = Header::new_gnu();
+                header.set_size(html.len() as u64);
+                // Give entries a regular, readable file mode instead of relying
+                // on whatever the freshly created header contains.
+                header.set_mode(0o644);
+                header.set_cksum();
+                archive
+                    .append_data(&mut header, filename, html.as_bytes())
+                    .expect("cannot write tar entry to stdout");
+            }
+            None => {
+                let mut file = File::create(format!("out/{}", filename))
+                    .expect("cannot create output file");
+                file.write_all(html.as_bytes())
+                    .expect("cannot write output file");
+            }
+        }
+    }
+
+    if let Some(mut archive) = archive {
+        archive
+            .finish()
+            .expect("cannot finish tar archive on stdout");
+    }
+}
+
+/// Escapes `text` for safe inclusion in HTML element content or attribute values.
+///
+/// `&`, `<`, `>`, `'` and `"` are replaced by character references; every other
+/// character is copied unchanged. The apostrophe is emitted as `&#39;` so that it
+/// stays an apostrophe when rendered (`&#8217;` would display U+2019, a different
+/// character).
+pub fn escape(text: &str) -> String {
+    // Single pass over the input instead of one full copy per replaced character.
+    let mut escaped = String::with_capacity(text.len());
+    for ch in text.chars() {
+        match ch {
+            '&' => escaped.push_str("&amp;"),
+            '<' => escaped.push_str("&lt;"),
+            '>' => escaped.push_str("&gt;"),
+            '\'' => escaped.push_str("&#39;"),
+            '"' => escaped.push_str("&quot;"),
+            other => escaped.push(other),
+        }
+    }
+    escaped
+}
+
+/// Appends the HTML rendering of each node in `nodes` to `html`, in order.
+fn render_nodes(html: &mut String, nodes: Vec<Node>) {
+    nodes
+        .into_iter()
+        .for_each(|node| render_node(html, node));
+}
+/// Renders `nodes` into a newly allocated HTML string.
+fn render_nodes_to_string(nodes: Vec<Node>) -> String {
+    let mut rendered = String::new();
+    render_nodes(&mut rendered, nodes);
+    rendered
+}
+/// Appends the HTML rendering of a single wikitext node to `html`.
+///
+/// Headings, horizontal dividers, preformatted blocks and plain text are
+/// rendered; bold/italic markers and comments produce no output; every other
+/// node kind is not implemented yet and emits a `[todo]` placeholder.
+fn render_node(html: &mut String, n: Node) {
+    use std::fmt::Write;
+    // Every variant is listed explicitly (no `_` arm) so that a new node kind
+    // added by the parser shows up as a compile error here.
+    match n {
+        // Formatting toggles and comments are currently dropped.
+        Node::Bold { .. }
+        | Node::BoldItalic { .. }
+        | Node::Comment { .. }
+        | Node::Italic { .. } => {}
+
+        Node::Heading { level, nodes, .. } => {
+            write!(html, "<h{level}>").unwrap();
+            render_nodes(html, nodes);
+            write!(html, "</h{level}>").unwrap();
+        }
+        Node::HorizontalDivider { .. } => html.push_str("<hr>"),
+        Node::Preformatted { nodes, .. } => {
+            html.push_str("<pre>");
+            render_nodes(html, nodes);
+            html.push_str("</pre>");
+        }
+        Node::Text { value, .. } => html.push_str(&escape(value)),
+
+        Node::Image { .. } => html.push_str("[todo: image]"),
+
+        // Not implemented yet.
+        Node::Category { .. }
+        | Node::CharacterEntity { .. }
+        | Node::DefinitionList { .. }
+        | Node::EndTag { .. }
+        | Node::ExternalLink { .. }
+        | Node::Link { .. }
+        | Node::MagicWord { .. }
+        | Node::OrderedList { .. }
+        | Node::ParagraphBreak { .. }
+        | Node::Parameter { .. }
+        | Node::Redirect { .. }
+        | Node::StartTag { .. }
+        | Node::Table { .. }
+        | Node::Tag { .. }
+        | Node::Template { .. }
+        | Node::UnorderedList { .. } => html.push_str("[todo]"),
+    }
+}