Benutzer:Dealerofsalvation/Bots/BKLCheck/Sourcecode

aus Wikipedia, der freien Enzyklopädie
Zur Navigation springen Zur Suche springen
import static javax.xml.stream.XMLStreamConstants.END_ELEMENT;
import static javax.xml.stream.XMLStreamConstants.START_ELEMENT;

import java.beans.XMLDecoder;
import java.beans.XMLEncoder;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.zip.GZIPInputStream;

import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;

/**
 * Bot that walks the members of the German Wikipedia category
 * "Begriffsklärung" (disambiguation pages) and records every page whose
 * outgoing article links all point to missing pages ("vollrot").
 *
 * <p>
 * Progress (continuation token, user agent, result set) is persisted as a
 * JavaBean via {@link XMLEncoder} in {@code state.xml} by a shutdown hook, and
 * the result set is additionally written as wikitext to {@code out.wiki}. The
 * public getters/setters exist for that bean persistence.
 *
 * <p>
 * The shutdown hook runs on its own thread while {@link #run()} may still be
 * working, so access to the persisted fields is guarded by {@code this}.
 */
public class BKLCheckBot {

	private static final File STATE_FILE = new File("state.xml");

	private static final Logger logger = Logger.getLogger("bklCheckBot");

	/**
	 * Common prefix of all API requests. HTTPS is used because
	 * {@link URLConnection} does not follow a redirect from http to https.
	 */
	private static final String API_QUERY_URL = "https://de.wikipedia.org/w/api.php?format=xml&action=query";

	/**
	 * Entry point. Restores a previous run from {@code state.xml} if present;
	 * otherwise the first command line argument is used as User-Agent.
	 *
	 * @param args
	 *            on the very first start: the User-Agent string
	 * @throws IOException
	 *             on network or file errors
	 * @throws XMLStreamException
	 *             if an API response is not well-formed XML
	 * @throws IllegalStateException
	 *             if there is neither a state file nor a User-Agent argument
	 */
	public static void main(String[] args) throws IOException,
			XMLStreamException {
		final BKLCheckBot bot;
		if (STATE_FILE.exists()) {
			bot = readState();
		} else if (0 == args.length) {
			throw new IllegalStateException(
					"Beim ersten Start bitte User-Agent als Parameter angeben, z. B. E-Mail-Adresse. Siehe https://www.mediawiki.org/wiki/API#Identifying_your_client");
		} else {
			bot = new BKLCheckBot();
			bot.userAgent = args[0];
		}
		Runtime.getRuntime().addShutdownHook(new Thread() {
			@Override
			public void run() {
				// The two outputs are independent: a failure writing one
				// must not prevent the other from being written.
				try {
					bot.writeState();
				} catch (IOException | RuntimeException e) {
					logger.log(Level.WARNING, "writeState", e);
				}
				try {
					bot.writeWikitext();
				} catch (IOException | RuntimeException e) {
					logger.log(Level.WARNING, "writeWikitext", e);
				}
			}
		});
		bot.run();
	}

	/**
	 * Writes one wiki link per recorded page to {@code out.wiki}, encoded as
	 * UTF-8 so that non-ASCII titles do not depend on the platform charset.
	 */
	private void writeWikitext() throws IOException {
		try (Writer writer = new OutputStreamWriter(new FileOutputStream(
				"out.wiki"), "UTF-8")) {
			// getVollroteBKLs() returns a snapshot, so iterating is safe
			// even while run() keeps adding entries.
			for (String s : getVollroteBKLs()) {
				writer.write("[[" + s + "]]\n");
			}
		}
	}

	/** Continuation token of the batch currently being processed (persisted). */
	private String cmContinue;

	/** Continuation token for the following batch; null if there is none. */
	private transient String nextCMContinue;

	private Set<String> vollroteBKLs = new LinkedHashSet<>();

	private String userAgent;

	private final XMLInputFactory factory = createFactory();

	/**
	 * Creates the StAX factory with DTD and external entity processing turned
	 * off, since the parsed documents come from the network.
	 */
	private static XMLInputFactory createFactory() {
		XMLInputFactory result = XMLInputFactory.newInstance();
		result.setProperty(XMLInputFactory.SUPPORT_DTD, Boolean.FALSE);
		result.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES,
				Boolean.FALSE);
		return result;
	}

	/**
	 * Processes the category batch by batch, starting at {@link #cmContinue},
	 * until the API reports no further continuation token. After a complete
	 * pass {@code cmContinue} is null again.
	 */
	private void run() throws IOException, XMLStreamException {
		String next;
		do {
			for (String bkl : getBKLs()) {
				if (istVollrot(bkl)) {
					logger.info("Vollrot: " + bkl);
					synchronized (this) {
						vollroteBKLs.add(bkl);
					}
				}
			}
			// Only advance once the whole batch is done, so an interrupted
			// batch is repeated after a restart.
			synchronized (this) {
				cmContinue = nextCMContinue;
				next = cmContinue;
			}
		} while (null != next);
	}

	/**
	 * Asks the API for the article-namespace links of the given page and
	 * reports whether all of them are missing.
	 *
	 * <p>
	 * NOTE(review): only the first response is evaluated ({@code gpllimit=max});
	 * a continuation of the link list is not followed.
	 *
	 * @param bkl
	 *            page title
	 * @return see {@link #parseVollrot(InputStream)}
	 */
	private boolean istVollrot(String bkl) throws IOException,
			XMLStreamException {
		String spec = API_QUERY_URL
				+ "&generator=links&gplnamespace=0&gpllimit=max&titles="
				+ URLEncoder.encode(bkl, "UTF-8");
		try (InputStream stream = getStream(spec)) {
			return parseVollrot(stream);
		}
	}

	/**
	 * Evaluates a {@code generator=links} response. Package-private so the
	 * parsing can be exercised without network access.
	 *
	 * @param stream
	 *            XML document; not closed by this method
	 * @return true if the document contains at least one {@code page} element
	 *         and every {@code page} element carries a {@code missing}
	 *         attribute; false otherwise (including a response without any
	 *         {@code page} element, e.g. a page without links)
	 * @throws XMLStreamException
	 *             if the document is not well-formed
	 */
	boolean parseVollrot(InputStream stream) throws XMLStreamException {
		// Sample response:
		// <?xml version="1.0"?>
		// <api>
		// <limits links="500" />
		// <query>
		// <pages>
		// <page ns="0" title="Francis de Quervain" missing="" />
		// <page ns="0" title="Marcel Roland de Quervain" missing="" />
		// <page pageid="3046608" ns="0"
		// title="Alfred de Quervain (Geophysiker)" />
		// <page pageid="941244" ns="0" title="Alfred de Quervain (Theologe)" />
		// <page pageid="1563710" ns="0" title="Fritz de Quervain" />
		// </pages>
		// </query>
		// </api>
		boolean sawLink = false;
		XMLStreamReader reader = factory.createXMLStreamReader(stream);
		try {
			while (reader.hasNext()) {
				if (START_ELEMENT != reader.next()
						|| !"page".equals(reader.getLocalName())) {
					continue;
				}
				if (null == reader.getAttributeValue(null, "missing")) {
					// one existing link target is enough to decide
					return false;
				}
				sawLink = true;
			}
		} finally {
			reader.close();
		}
		return sawLink;
	}

	/**
	 * Fetches the next batch of category members, starting at
	 * {@link #cmContinue}, and stores the token for the following batch in
	 * {@link #nextCMContinue} (null when this was the last batch).
	 *
	 * @return titles of the batch, never null
	 */
	private List<String> getBKLs() throws IOException, XMLStreamException {
		// rawcontinue requests the legacy <query-continue> layout, in which
		// only the module's own parameter has to be passed back.
		String spec = API_QUERY_URL
				+ "&rawcontinue=&list=categorymembers&cmnamespace=0&cmprop=title&cmtitle=Kategorie:Begriffskl%C3%A4rung";
		String from = getCmContinue();
		if (null != from) {
			// the token contains '|' and must be escaped
			spec = spec + "&cmcontinue=" + URLEncoder.encode(from, "UTF-8");
		}
		try (InputStream stream = getStream(spec)) {
			return parseBKLs(stream);
		}
	}

	/**
	 * Evaluates a {@code list=categorymembers} response: collects the
	 * {@code title} of every {@code cm} element and remembers the value of a
	 * {@code cmcontinue} attribute found on any other element in
	 * {@link #nextCMContinue} (reset to null if there is none). This accepts
	 * both the {@code <query-continue><categorymembers cmcontinue=.../>} and
	 * the {@code <continue cmcontinue=.../>} layout. Package-private so the
	 * parsing can be exercised without network access.
	 *
	 * @param stream
	 *            XML document; not closed by this method
	 * @return titles in document order, never null
	 * @throws XMLStreamException
	 *             if the document is not well-formed
	 */
	List<String> parseBKLs(InputStream stream) throws XMLStreamException {
		// Sample response:
		// <api>
		// <query-continue>
		// <categorymembers
		// cmcontinue="page|233a31204c4947410a312e204c494741|1536594"/>
		// </query-continue>
		// <query>
		// <categorymembers>
		// <cm ns="0" title="1. Dalai Lama"/>
		// <cm ns="0" title="1 Decembrie"/>
		// <cm ns="0" title="I. Korps"/>
		// </categorymembers>
		// </query>
		// </api>
		List<String> result = new ArrayList<>();
		String continuation = null;
		XMLStreamReader reader = factory.createXMLStreamReader(stream);
		try {
			while (reader.hasNext()) {
				if (START_ELEMENT != reader.next()) {
					continue;
				}
				if ("cm".equals(reader.getLocalName())) {
					String title = reader.getAttributeValue(null, "title");
					if (null != title) {
						result.add(title);
					}
				} else {
					String value = reader.getAttributeValue(null,
							"cmcontinue");
					if (null != value) {
						continuation = value;
					}
				}
			}
		} finally {
			reader.close();
		}
		nextCMContinue = continuation;
		return result;
	}

	/**
	 * Opens the given URL with the configured User-Agent, offering gzip
	 * transfer. The response is only unzipped if the server actually declares
	 * {@code Content-Encoding: gzip}.
	 *
	 * @param spec
	 *            complete request URL
	 * @return the (decompressed) response body; the caller must close it
	 */
	private InputStream getStream(String spec) throws MalformedURLException,
			IOException {
		URL url = new URL(spec);
		URLConnection connection = url.openConnection();
		connection.setRequestProperty("Accept-encoding", "gzip");
		connection.setRequestProperty("User-Agent", userAgent);
		connection.connect();
		logger.info(spec);
		InputStream raw = connection.getInputStream();
		if (!"gzip".equalsIgnoreCase(connection.getContentEncoding())) {
			return raw;
		}
		try {
			return new GZIPInputStream(raw);
		} catch (IOException e) {
			// do not leak the connection stream if the gzip header is broken
			raw.close();
			throw e;
		}
	}

	/**
	 * Restores a bot instance from {@code state.xml}.
	 *
	 * @throws FileNotFoundException
	 *             if the state file cannot be opened
	 */
	private static BKLCheckBot readState() throws FileNotFoundException {
		try (XMLDecoder decoder = new XMLDecoder(
				new FileInputStream(STATE_FILE))) {
			return (BKLCheckBot) decoder.readObject();
		}
	}

	/** Persists the bean properties of this instance to {@code state.xml}. */
	private void writeState() throws IOException {
		try (XMLEncoder encoder = new XMLEncoder(new FileOutputStream(
				STATE_FILE))) {
			encoder.writeObject(this);
		}
	}

	public synchronized String getCmContinue() {
		return cmContinue;
	}

	public synchronized void setCmContinue(String s) {
		cmContinue = s;
	}

	public synchronized String getUserAgent() {
		return userAgent;
	}

	public synchronized void setUserAgent(String s) {
		userAgent = s;
	}

	/** @return a snapshot copy of the recorded page titles */
	public synchronized Set<String> getVollroteBKLs() {
		return new LinkedHashSet<>(vollroteBKLs);
	}

	/** Replaces the recorded page titles with a copy of the given set. */
	public synchronized void setVollroteBKLs(Set<String> set) {
		vollroteBKLs = new LinkedHashSet<>(set);
	}

}