// User:Hendrik Brummermann/XHTMLDumper.java  (MediaWiki page export)
// "Appearance" -- wiki page chrome, not part of the program.
//
// This work is licensed under CC-BY // (Creative Commons License - Attribution 2.0). // see: http://creativecommons.org/licenses/by/2.0/ // You need the program "tidy" in your system's search path. /* * $Log: XHTMLDumper.java,v $ * Revision 1.8 2005/01/08 12:01:30 nhb * Fixing invokation of wget * * Revision 1.7 2004/12/11 18:08:28 nhb * Store output of tidy into a file instead of reading it directly from stdout. * Do not depand on node.toString() dumping the whole xml tree. * * Revision 1.6 2004/09/28 19:50:46 nhb * Bugfix: Doppeltes head-Element beseitigt und im inline Stylesheet die linke Spalte auf 0 gesetzt * * Revision 1.5 2004/09/28 19:09:55 nhb * - Skriptbasiertes Herunterladen von allen Links auf einer Seite. * - Der Head-Bereich wird ausgetauscht. * - Bild-URLs werden entsprechend umgeschrieben. * - Ausgabe eines wget-Skripts fuer Bilder * - keine Umkodierung von UTF-8 nach ISO-8859-1 mehr. * * Revision 1.4 2004/08/29 18:12:50 nhb * Neue Klasse: Book * * Revision 1.3 2004/08/28 21:27:26 nhb * *** empty log message *** * * Revision 1.2 2004/08/28 08:16:09 nhb * Refectoring * * Revision 1.1 2004/08/23 22:16:56 nhb * inital checkin * */ package nhb.wikipedia; import java.io.BufferedInputStream; import java.io.BufferedOutputStream; import java.io.BufferedReader; import java.io.ByteArrayInputStream; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.OutputStream; import java.io.StringWriter; import java.net.URL; import java.util.HashSet; import java.util.Iterator; import java.util.Set; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; import javax.xml.transform.Transformer; import javax.xml.transform.TransformerConfigurationException; import javax.xml.transform.TransformerException; import javax.xml.transform.TransformerFactory; import 
javax.xml.transform.dom.DOMSource; import javax.xml.transform.stream.StreamResult; import org.apache.xpath.XPathAPI; import org.w3c.dom.Document; import org.w3c.dom.Element; import org.w3c.dom.Node; import org.w3c.dom.NodeList; import org.w3c.dom.traversal.NodeIterator; import org.xml.sax.SAXException; /** * fetches a collection of articles to disk. * It can rewrite links and remove the MediaWiki navigation. * Several articles can be combined to one file. * * @author Hendrik Brummermann <nhb_web@nexgo.de> * @link http://creativecommons.org/licenses/by/2.0/ */ public class XHTMLDumper { // --> --> --> --> --> --> --> --> --> --> --> --> --> --> --> private static final String URL_PREFIX = "http://localhost:10080"; //"http://wiki"; private static final String WIKI_PATH = "/mediawiki/index.php/"; private static final String UPLOAD_PATH = "/mediawiki/images"; private static final String TARGET = "/tmp/wiki"; private static final String IMAGE_FOLDER = "wiki_files"; private static final String ID_SEP = "_____"; // <-- <-- <-- <-- <-- <-- <-- <-- <-- <-- <-- <-- <-- <-- <-- public XHTMLDumper() { } public class Article { // Variablen // private final Namespace NS_HTML = Namespace.getNamespace("http://www.w3.org/1999/xhtml"); private String title = null; private String url = null; private Element root = null; private Element content = null; private boolean unifyIDs = false; private boolean convertShortTags = true; private boolean fetchPageRequisites = true; private boolean rewriteLocalURLs = true; private Set pageRequisites = new HashSet(); private String head = "<head><meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />\r\n" + "<link rel=\"shortcut icon\" href=\"wiki_files/favicon.ico\" />\r\n" + "<style type=\"text/css\" media=\"screen,projection\">/*<![CDATA[*/ @import \"wiki_files/main.css\"; /*]]>*/#content {margin: 0}</style>\r\n" + "<link rel=\"stylesheet\" type=\"text/css\" media=\"print\" href=\"wiki_files/commonPrint.css\" />\r\n" + "<script 
type=\"text/javascript\" src=\"wiki_files/wikibits.js\"> </script>\r\n" + "<title>HISLSF - Dokumentation</title>\r\n" + "</head>"; // <title>DB-Interface-Admin - His</title> public Article(String title) { this.title = title; url = URL_PREFIX + WIKI_PATH + title; fetchAsXHTML(); } public void process() { extractContent(); unifyIDsAndConvertShortTags(root); } /** * Stores the file to disk. * * @throws IOException bei einem E/A-Fehler * @throws ParserConfigurationException Konfigurationsfehler des XML-Parsers * @throws SAXException bei einem XML-Fehler */ public void saveToDisk() throws SAXException, IOException, ParserConfigurationException { process(); replaceHead(); // fetchPageRequisites(); String filename = TARGET + "/" + title.replace(' ', '_').replace('/', '-') + ".html"; OutputStream of = new FileOutputStream(filename); of.write(XMLUtils.dumpXML(root).getBytes("UTF-8")); of.close(); } /** * replaces the head-element <!-- preserving the title-element.--> * * @throws IOException bei einem E/A-Fehler * @throws ParserConfigurationException Konfigurationsfehler des XML-Parsers * @throws SAXException bei einem XML-Fehler */ private void replaceHead() throws SAXException, IOException, ParserConfigurationException { Document doc = DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(new ByteArrayInputStream(head.getBytes())); Node oldHead = null; for (int i = 0; i < root.getChildNodes().getLength(); i++) { oldHead = root.getChildNodes().item(i); if ((oldHead instanceof Element) && ((Element) oldHead).getNodeName().equalsIgnoreCase("head")) { break; } } Node newHead = doc.getDocumentElement(); newHead = root.getOwnerDocument().importNode(newHead, true); root.insertBefore(newHead, oldHead); root.removeChild(oldHead); } /** * Downloads a HTML-document, converts it into xhtml using tidy * and parses it into an xml object tree. 
*/ private void fetchAsXHTML() { try { // fetch String file = NetUtil.fetchDocumentAsFile(url); // run tidy //"tidy -asxhtml -utf8 $1 >$1.html 2> /dev/null" // Process process = Runtime.getRuntime().exec("tidy -q -asxhtml -utf8 " + file); String outFile = File.createTempFile("xhtml", ".html").getAbsolutePath(); Process process = Runtime.getRuntime().exec("tidy -q -asxhtml -utf8 -o " + outFile + " "+ file); /* System.out.println("sleeping"); Thread.sleep(5000); System.out.println("sleeped");*/ process.waitFor(); //Document doc = DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(process.getInputStream()); Document doc = DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(new FileInputStream(outFile)); root = doc.getDocumentElement(); } catch (Exception e) { e.printStackTrace(); } } /** * Extracts the content (i. e. strips the navigation). */ private void extractContent() { try { content = (Element) XPathAPI.selectSingleNode(root, "//div[@id='content']"); // Einige Elemente loeschen // XMLUtils.removeChildren(root, "//self::node()[@id='contentSub' or @id='siteSub' or @id='toc' or @class='printfooter' or @id='catlinks' or @class='editsection']"); XMLUtils.removeChildren(root, "//self::node()[@id='column-one' or @id='footer' or @id='contentSub' or @id='siteSub' or @class='printfooter' or @id='catlinks' or @class='editsection']"); } catch (TransformerException e) { e.printStackTrace(); } } /* name: top id: contentTop id: bodyContent id: contentSub */ private void unifyIDsAndConvertShortTags(Element element) { NodeList nodes = element.getChildNodes(); for (int i = 0; i < nodes.getLength(); i++) { Node node = nodes.item(i); if (node instanceof Element) { Element e = (Element) node; String nodeName = node.getNodeName(); // unify IDs if (unifyIDs && "a".equals(nodeName)) { String val = e.getAttribute("name"); if (!val.equals("")) { e.setAttribute("name", title + ID_SEP + val); e.setAttribute("id", title + ID_SEP + val); } val = 
e.getAttribute("href"); if ((val.length() > 1) && val.charAt(0) == '#') { e.setAttribute("href", title + ID_SEP + val.substring(1)); System.out.println("#" + title + ID_SEP + val.substring(1)); } } // convert short tags if (convertShortTags && ("a".equals(nodeName) || "div".equals(nodeName)) && (e.getFirstChild() == null)) { e.appendChild(e.getOwnerDocument().createTextNode("")); } // convert links to other pages if (rewriteLocalURLs && "a".equals(nodeName)) { String val = e.getAttribute("href"); if (val.startsWith(WIKI_PATH)) { e.setAttribute("href", val.substring(WIKI_PATH.length())); } } // collection image urls and rewrite img-src links. if (fetchPageRequisites && "img".equals(nodeName)) { String url = e.getAttribute("src"); pageRequisites.add(url); if (url.startsWith(UPLOAD_PATH)) { url = IMAGE_FOLDER + url.substring(UPLOAD_PATH.length() + 5); e.setAttribute("src", url); } } // go to the next level unifyIDsAndConvertShortTags((Element) node); } } } /** * Returns a set of page requisites (like images) * * @return Set */ public Set getPageRequisites() { return pageRequisites; } /** * Return the xml object. 
* * @return Element */ public Element getXML() { if (content != null) { return content; } else { return root; } } } public class Book { private Set pages = new HashSet(); private Set pageRequisites = new HashSet(); /** * creates a new book * * @param name page containing a list of links * @throws IOException */ public Book(String name) throws IOException { // fetch wiki text BufferedReader br = NetUtil.fetchDocumentAsBufferedReader(URL_PREFIX + WIKI_PATH + name + "?action=raw"); fetchLinkList(br); br.close(); } /** * Fetches all pages of this book * @throws IOException * @throws ParserConfigurationException * @throws SAXException */ public void fetchBook() throws SAXException, IOException, ParserConfigurationException { Iterator itr = pages.iterator(); while (itr.hasNext()) { String page = (String) itr.next(); System.out.println("fetching " + page + "..."); Article article = new Article(page); article.saveToDisk(); pageRequisites.addAll(article.getPageRequisites()); } fetchPageRequisites(); } private void fetchPageRequisites() { System.out.println("cd " + TARGET + "/" + IMAGE_FOLDER); Iterator itr = pageRequisites.iterator(); while (itr.hasNext()) { System.out.println("wget -N " + URL_PREFIX + itr.next()); } } private void fetchLinkList(BufferedReader br) throws IOException { String line = br.readLine(); while (line != null) { int pos = line.indexOf("[["); while (pos > -1) { line = line.substring(pos + 2); int posEnd = line.indexOf("]]"); if (posEnd == -1) { // is the link closed? 
break; } String link = line.substring(0, posEnd); pos = link.indexOf("|"); if (pos > -1) { link = link.substring(0, pos); } link = link.trim(); String page = link; if (page.length() == 0) { continue; } page = page.replace(' ', '_'); pages.add(page); // find next link line = line.substring(posEnd + 2); pos = line.indexOf("[["); } line = br.readLine(); } } } public class Cover { private Element root = null; public Cover(String name) { Article cover = new Article(name); root = cover.getXML(); XMLUtils.removeChildren(root, "//div[@id='content']/*"); XMLUtils.removeChildren(root, "//div[@id='column-one' or @id='footer']"); try { Element content = (Element) XPathAPI.selectSingleNode(root, "//div[@id='content']"); } catch (TransformerException e) { e.printStackTrace(); } } public Element getXML() { return root; } } public static class XMLUtils { /** hide constructor */ private XMLUtils() { } /** * Dumps an XML-tree into a String * * @param node xml-node * @return String */ public static String dumpXML(Node node) { try { // Message-ID: <gX009.6306$Ky3.363117@newsread2.prod.itd.earthlink.net> From: "Billy Ng" DOMSource source = new DOMSource(node); TransformerFactory tfFactory = TransformerFactory.newInstance(); Transformer transformer = tfFactory.newTransformer(); StringWriter sw = new StringWriter(); StreamResult result = new StreamResult(sw); transformer.transform(source, result); return sw.toString(); } catch (TransformerConfigurationException e) { e.printStackTrace(); } catch (TransformerException e) { e.printStackTrace(); } return ""; } public static void removeChildren(Element parent, String xpath) { try { NodeIterator itr = XPathAPI.selectNodeIterator(parent, xpath); Node node = itr.nextNode(); Set set = new HashSet(); while (node != null) { set.add(node); node = itr.nextNode(); } Iterator itr2 = set.iterator(); while (itr2.hasNext()) { node = (Node) itr2.next(); node.getParentNode().removeChild(node); } } catch (TransformerException e) { e.printStackTrace(); } } } 
/** * Utility class for network access. */ public static final class NetUtil { private static final int BUFFER_SIZE = 10240; /** Hide constructor */ private NetUtil() { } /** * Gibt einen BufferedReader mit dem Ziel der URL zurueck. * * @param urlString URL * @return BufferedReader * @throws IOException bei einem E/A-Fehler */ public static BufferedReader fetchDocumentAsBufferedReader(String urlString) throws IOException { URL url = new URL(urlString); InputStream is = url.openStream(); return new BufferedReader(new InputStreamReader(is)); } /** * Laedt ein Dokument aus dem Netz herunter und * speichert es in einer lokalen Datei. * * @param urlString URL * @return Dateiname * @throws IOException bei einem Fehler */ public static String fetchDocumentAsFile(String urlString) throws IOException { byte[] temp = new byte[BUFFER_SIZE + 1]; URL url = new URL(urlString); BufferedInputStream is = new BufferedInputStream(url.openStream()); File file = File.createTempFile("dump", ".html"); file.deleteOnExit(); String tempFile = file.getAbsolutePath(); BufferedOutputStream os = new BufferedOutputStream(new FileOutputStream(tempFile)); while (true) { int aval = is.available(); if (aval == 0) { try { Thread.sleep(100); } catch (InterruptedException e) { e.printStackTrace(); } } int count = is.read(temp, 0, BUFFER_SIZE); if (count == -1) { break; } os.write(temp, 0, count); } is.close(); os.close(); return tempFile; } } /** * main entry point * * @param args command line arguments * @throws Exception if something unexpected happend */ public static void main(String[] args) throws Exception { if (args.length == 0) { System.err.println("Aufruf: nhb.wikipedia.XHTMLDumper title-of-link-list"); System.err.println(" title-of-link-list is the title of the page containing a list of links."); System.exit(1); } XHTMLDumper xd = new XHTMLDumper(); Book book = xd.new Book(args[0]); book.fetchBook(); /*article.process(); System.out.println(article.getXML());*/ System.out.println("Fertig."); } 
} //