marytts.tools.dbselection
Class WikipediaProcessor

java.lang.Object
  extended by marytts.tools.dbselection.WikipediaProcessor

public class WikipediaProcessor
extends java.lang.Object

WikipediaProcessor: This program processes, one by one, the XML files split with wikipediaDumpSplitter. Each XML file is converted to an SQL source file with mwdumper-2008-04-13.jar (org.mediawiki.dumper.Dumper). The table names in the SQL source are prefixed with the locale (e.g. en_US, de, etc.). Each SQL source is loaded into a MySQL database; basically, the tables local_text, local_page and local_revision are loaded (where "local" stands for the locale prefix). Once the tables are loaded, the WikipediaMarkupCleaner is used to extract clean text and a wordList; as a result, two tables will be created in the database: local_cleanText and local_wordList (the wordList is also saved in a file).

Author:
Marcela Charfuelan.

Constructor Summary
WikipediaProcessor()
           
 
Method Summary
 boolean getDebug()
           
 boolean getDeleteCleanTextTable()
           
 java.lang.String getListFile()
           
 boolean getLoadWikiTables()
           
 java.lang.String getLocale()
           
 int getMaxTextLength()
           
 int getMinPageLength()
           
 int getMinTextLength()
           
 java.lang.String getMysqlDB()
           
 java.lang.String getMysqlHost()
           
 java.lang.String getMysqlPasswd()
           
 java.lang.String getMysqlUser()
           
 java.lang.String getPageFile()
           
 java.lang.String getRevisionFile()
           
 java.lang.String getTestId()
           
 java.lang.String getTextFile()
           
 java.lang.String getWikiLog()
           
static void main(java.lang.String[] args)
           
 void setDebug(boolean bval)
           
 void setDeleteCleanTextTable(boolean bval)
           
 void setListFile(java.lang.String str)
           
 void setLoadWikiTables(boolean bval)
           
 void setLocale(java.lang.String str)
           
 void setMaxTextLength(int val)
           
 void setMinPageLength(int val)
           
 void setMinTextLength(int val)
           
 void setMysqlDB(java.lang.String str)
           
 void setMysqlHost(java.lang.String str)
           
 void setMysqlPasswd(java.lang.String str)
           
 void setMysqlUser(java.lang.String str)
           
 void setPageFile(java.lang.String str)
           
 void setRevisionFile(java.lang.String str)
           
 void setTestId(java.lang.String str)
           
 void setTextFile(java.lang.String str)
           
 void setWikiLog(java.lang.String str)
           
 
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
 

Constructor Detail

WikipediaProcessor

public WikipediaProcessor()
Method Detail

setLocale

public void setLocale(java.lang.String str)

setMysqlHost

public void setMysqlHost(java.lang.String str)

setMysqlDB

public void setMysqlDB(java.lang.String str)

setMysqlUser

public void setMysqlUser(java.lang.String str)

setMysqlPasswd

public void setMysqlPasswd(java.lang.String str)

setListFile

public void setListFile(java.lang.String str)

setTextFile

public void setTextFile(java.lang.String str)

setPageFile

public void setPageFile(java.lang.String str)

setRevisionFile

public void setRevisionFile(java.lang.String str)

setWikiLog

public void setWikiLog(java.lang.String str)

setTestId

public void setTestId(java.lang.String str)

setMinPageLength

public void setMinPageLength(int val)

setMinTextLength

public void setMinTextLength(int val)

setMaxTextLength

public void setMaxTextLength(int val)

setDebug

public void setDebug(boolean bval)

setLoadWikiTables

public void setLoadWikiTables(boolean bval)

setDeleteCleanTextTable

public void setDeleteCleanTextTable(boolean bval)

getLocale

public java.lang.String getLocale()

getMysqlHost

public java.lang.String getMysqlHost()

getMysqlDB

public java.lang.String getMysqlDB()

getMysqlUser

public java.lang.String getMysqlUser()

getMysqlPasswd

public java.lang.String getMysqlPasswd()

getListFile

public java.lang.String getListFile()

getTextFile

public java.lang.String getTextFile()

getPageFile

public java.lang.String getPageFile()

getRevisionFile

public java.lang.String getRevisionFile()

getWikiLog

public java.lang.String getWikiLog()

getTestId

public java.lang.String getTestId()

getMinPageLength

public int getMinPageLength()

getMinTextLength

public int getMinTextLength()

getMaxTextLength

public int getMaxTextLength()

getDebug

public boolean getDebug()

getLoadWikiTables

public boolean getLoadWikiTables()

getDeleteCleanTextTable

public boolean getDeleteCleanTextTable()

main

public static void main(java.lang.String[] args)
                 throws java.lang.Exception
Throws:
java.lang.Exception