C++ Convert Wikipedia XML Dump To Mysql (UTF-8 safe)


Here is a project that parses the Wikipedia XML database dump into MySQL, for use with MediaWiki or any other application.

Wikipedia, the online encyclopedia, is also made available for download for offline and research use. To download the English Wikipedia database dump, go to http://download.wikimedia.org/enwiki/. The download link I used was for pages-articles.xml.bz2 (4.1 GB). When you decompress it, it grows to a 19 GB XML file.


Several challenges:
(1) The 19 GB XML file is larger than 4 GB, so some 32-bit compilers have trouble reading a file this large. I used 'g++' on Linux and had no issues.
(2) The XML file is encoded in UTF-8. As you can see in the XML parser below, I assume that XML tags and attributes are all valid ASCII, which is the case in the enwiki-20081008-pages-articles.xml file. It is the node values that can contain international UTF-8 characters (i.e. <page>nodevalue</page>).
(3) No character corruption: XML entities (&quot; &lt; &gt; &amp;) must be decoded properly, and special characters must then be escaped properly for MySQL.

Here is a table which shows how UTF-8 spreads the data for a single character across multiple bytes. UTF-8 was well designed: all ASCII characters are also valid UTF-8 characters. More info at http://en.wikipedia.org/wiki/UTF-8.


bytes | bits | representation
1     | 7    | 0bbbbbbb
2     | 11   | 110bbbbb 10bbbbbb
3     | 16   | 1110bbbb 10bbbbbb 10bbbbbb
4     | 21   | 11110bbb 10bbbbbb 10bbbbbb 10bbbbbb
Each b represents a bit that can be used to store character data.





First, create your mysql database.
mysql --host=localhost --user=myuser --password=mypass < create.sql

Second, compile and run main.cpp in the same directory as enwiki-20081008-pages-articles.xml. It will produce a 17GB file out.sql.

Finally, import into mysql:
mysql --host=localhost --user=myuser --password=mypass --database=wikipedia < out.sql

create.sql
-- Schema for the converted dump: one row per page, holding its title and wikitext.
CREATE DATABASE `wikipedia`;
USE `wikipedia`;
 
-- utf8mb4 is used instead of utf8: MySQL's `utf8` is the 3-byte utf8mb3 encoding
-- and cannot store the 4-byte UTF-8 sequences that the converter writes for
-- code points above U+FFFF.
CREATE TABLE `wiki` (
  `id` bigint(20) UNSIGNED NOT NULL AUTO_INCREMENT,
  `title` text,
  `text` longtext,
  PRIMARY KEY  (`id`)
) ENGINE=MyISAM DEFAULT CHARSET=utf8mb4;

main.cpp
#include <iostream>
#include <fstream>
#include <string>
#include <vector>
#include <map>
#include <exception>
 
using namespace std;
 
// Forward declarations of the free helper functions defined after main().

// Decodes the next UTF-8 sequence (1 to 4 bytes) from fd and returns its code
// point; reports an invalid sequence on cerr and returns 0x00 in that case.
int getnext_utf8_byte(ifstream &fd);
// Encodes code point u as UTF-8 and writes the resulting bytes to fd.
void write_utf8byte(ofstream &fd, const int &u);
// Like write_utf8byte, but backslash-escapes NUL, LF, CR, backslash, both quote
// characters and 0x1a so the output can sit inside a quoted SQL string.
void write_utf8byte_quoted(ofstream &fout, const int &u);
// Reads one raw byte from fd; throws ios_base::failure if the stream is not
// good after the read.
char get_next_byte(ifstream& fd);
// Debug printer: code points 32..127 are printed as characters, all others as "?".
ostream& operator << (ostream& os, vector<int> &str);
 
// One XML tag as seen by the streaming parser, together with the character
// data that follows it up to the next '<'.
// In every read* method, `a` is the current look-ahead code point; it is
// updated in place as input is consumed from fd.
class XmlNode
{
public:
     // Tag name reduced to ASCII (other code points become '?'). Closing tags
     // keep their leading '/', e.g. "/page".
     string tagname;
     // Text following the tag, one Unicode code point per element.
     vector<int> data;//stores utf8 string as vector<int>
     // Attribute name -> value, both reduced to ASCII like tagname.
     map< string, string > attributes;
 
     // Reads the tag name that follows '<' and stores it in tagname.
     void readtagname(ifstream &fd, int &a);
     // Parses name="value" pairs up to the closing '>' into attributes.
     void readattributes(ifstream &fd, int &a);
     // Consumes the '>' that ends the tag, if `a` is currently on it.
     void readtagclose(ifstream &fd, int &a);
     // Collects the text up to the next '<' into data.
     void readnodevalue(ifstream &fd, int &a);
};
 
//-----------------------------------------------------------------------------
int main(int argc, char* argv[])
{
    bool inpage=false;
 
    //for debugging
    cout.unsetf(ios::dec);
    cout.setf(ios::hex);
 
    ifstream fd("enwiki-20081008-pages-articles.xml");
    ofstream fout("out.sql");
 
    try{
 
        int a = getnext_utf8_byte(fd);
        while (fd.good())
        {
            if (a==0x3C)//'<'
            {
                XmlNode node;
                node.readtagname(fd, a);
                node.readattributes(fd, a);
                node.readtagclose(fd, a);
                node.readnodevalue(fd, a);
 
                if(node.tagname=="page")
                {
                    inpage=true;
                }
                else if(node.tagname=="/page")
                {
                    inpage=false;
                }
                else if(inpage && node.tagname=="title")
                {
                    fout << "insert into `wiki`(`title`,`text`) values('";
                    for(int i=0, x=node.data.size(); i<x; i++)
                        write_utf8byte_quoted(fout, node.data[i] );
                    fout << "',";
                }
                else if(inpage && node.tagname=="text")
                {
                    fout << "'";
                    for(int i=0, x=node.data.size(); i<x; i++)
                        write_utf8byte_quoted(fout, node.data[i] );
                    fout << "');"<<endl;                
                }
 
            }//if (a==0x3C)//'<'
 
        }//while (fd.good())
    }
    catch(exception &e)//end of file exception will be thrown
    {
        cout << e.what() <<endl;
    }
    fd.close();
    fout.close();
 
    return 0;
}
//-----------------------------------------------------------------------------
// Reads the tag name that follows '<'. Characters are collected until a space
// or '>' is reached; code points outside 0..127 are stored as '?'. On return
// `a` holds the terminating space or '>'.
void XmlNode::readtagname(ifstream &fd, int &a)
{
    string name;
    for (a = getnext_utf8_byte(fd); !(a==0x20 || a==0x3E); a = getnext_utf8_byte(fd))
    {
        const bool is_ascii = (a>=0 && a<=127);
        name.push_back(is_ascii ? static_cast<char>(a) : '?');
    }
    // Assign only once the whole name has been read.
    this->tagname = name;
}
//-----------------------------------------------------------------------------
void XmlNode::readattributes(ifstream &fd, int &a)
{
    string token;
    string attribute_name;
    while (1)//start xml attributes
    {
        while (a==0x20) { a = getnext_utf8_byte(fd); }//eat whitespace
 
        while (a!=0x20 && a!=0x3D && a!=0x3E)//' ','=','>'
        {
            token+= ( (0<=a && a<=127) ? (char)a : '?' );
            a = getnext_utf8_byte(fd);
        }
 
        while (a==0x20) { a = getnext_utf8_byte(fd); }//eat whitespace
 
        if (a==0x3D)//'='
        {
            a = getnext_utf8_byte(fd);
 
            attribute_name=token; token.clear();
            while (a==0x20) { a = getnext_utf8_byte(fd); }//eat whitespace
 
            char delimiter = (char)a;
            if (delimiter==0x22 || delimiter==0x29)//'"','\''
            {
                a = getnext_utf8_byte(fd);
 
                while (a!=delimiter && a!=0x3E)//' ','=','>'
                {
                    token+= ( (0<=a && a<=127) ? (char)a : '?' );
                    a = getnext_utf8_byte(fd);
                }
                this->attributes[ attribute_name ] = token; token.clear();
            }
        }
        while (a==0x20) { a = getnext_utf8_byte(fd); }//eat whitespace
        if    (a==0x2f) { a = getnext_utf8_byte(fd); }//eat /
        if    (a==0x3E) { break;                     }//break on >
 
         a = getnext_utf8_byte(fd);
    }//while (1) //end xml attributes
}
//-----------------------------------------------------------------------------
// Steps past the '>' that terminates the tag. Does nothing when `a` is not
// currently '>'.
void XmlNode::readtagclose(ifstream &fd, int &a)
{
    if (a!=0x3E)
        return;
    a = getnext_utf8_byte(fd);//eat >
}
//-----------------------------------------------------------------------------
// Collects the character data that follows the tag into this->data, one code
// point per element, until the next '<' is reached (on return `a` is '<').
//
// The predefined XML entities &quot; &lt; &gt; &amp; &apos; are decoded to the
// character they stand for. Any other "&...;" run is copied through unchanged
// rather than dropped, so no input text is lost.
void XmlNode::readnodevalue(ifstream &fd, int &a)
{
    while (a!=0x3C)//'<'  read nodevalue
    {
        if (a!=0x26)//regular utf8 char (not ampersand entity)
        {
            this->data.push_back(a);
            a = getnext_utf8_byte(fd);
            continue;
        }

        // Collect the entity twice: as ASCII text for comparison and as raw
        // code points in case it has to be passed through verbatim.
        string token;
        vector<int> raw;
        while (a!=0x3b && a!=0x3C)// read until ';' (or '<' if the '&' is unterminated)
        {
            token+= ( (0<=a && a<=127) ? (char)a : '?' );
            raw.push_back(a);
            a = getnext_utf8_byte(fd);
        }
        if (a==0x3b)// include and consume the terminating ';'
        {
            token+= ';';
            raw.push_back(a);
            a = getnext_utf8_byte(fd);
        }

        // token holds the complete entity including '&' and ';', so the
        // comparisons must use the complete entity text as well.
        if      (token=="&quot;") this->data.push_back((int)'"');
        else if (token=="&lt;")   this->data.push_back((int)'<');
        else if (token=="&gt;")   this->data.push_back((int)'>');
        else if (token=="&amp;")  this->data.push_back((int)'&');
        else if (token=="&apos;") this->data.push_back((int)'\'');
        else this->data.insert(this->data.end(), raw.begin(), raw.end());
    }
}
//-----------------------------------------------------------------------------
int getnext_utf8_byte(ifstream &fd)
{
    char arr[4];
 
    arr[0]=get_next_byte(fd);
    if ((arr[0]&0x80)==0)
        return arr[0]&0x7F;
 
    arr[1]=get_next_byte(fd);
    if ((arr[0]&0x20)==0 && (arr[1]&0x40)==0)
        return ((arr[0]&0x1F) << 6) | (arr[1]&0x3F);
 
    arr[2]=get_next_byte(fd);
    if ((arr[0]&0x10)==0 && (arr[1]&0x40)==0 && (arr[2]&0x40)==0)
        return ((arr[0]&0x0F) << 12) | ((arr[1]&0x3F) << 6) | (arr[2]&0x3F);
 
    arr[3]=get_next_byte(fd);
    if ((arr[0]&0x08)==0 && (arr[1]&0x40)==0 && (arr[2]&0x40)==0 && (arr[3]&0x40)==0)
    return ((arr[0]&0x07) << 18) | ((arr[1]&0x3F) << 12) | ((arr[2]&0x3F) << 6) | (arr[3]&0x3F);
 
    cerr << "Invalid UTF8 Character "<<endl;
    fd.putback(arr[3]);
    fd.putback(arr[2]);
    fd.putback(arr[1]);
    return 0x00;
}
//-----------------------------------------------------------------------------
// Reads a single raw byte from fd and returns it.
// Throws ios_base::failure("end of file") when the stream is not in a good
// state after the read.
char get_next_byte(ifstream& fd)
{
    char byte;
    fd.get(byte);
    if (fd.good())
        return byte;
    throw ios_base::failure("end of file");
}
//-----------------------------------------------------------------------------
// Debug helper: prints a code point vector to os. Values in the range 32..127
// are written as single characters; every other value is written as "?".
ostream& operator << (ostream& os, vector<int> &str)
{//for debugging
    for (vector<int>::const_iterator it = str.begin(); it != str.end(); ++it)
    {
        const int cp = *it;
        const bool printable = (cp >= 32 && cp <= 127);
        if (!printable)
        {
            os << "?";
            continue;
        }
        os << static_cast<char>(cp);
    }
    return os;
}
//-----------------------------------------------------------------------------
// Writes code point u to fout for use inside a quoted MySQL string literal.
// NUL, LF, CR, backslash, both quote characters and Ctrl-Z (0x1a) are written
// as MySQL escape sequences; every other code point is written as UTF-8.
//
// NUL and Ctrl-Z use \0 and \Z. MySQL has no \xNN escape: it reads "\x00" as
// the literal text "x00".
void write_utf8byte_quoted(ofstream &fout, const int &u)
{
    if      (u==0x00) fout <<"\\0";
    else if (u==0x0a) fout <<"\\n";
    else if (u==0x0d) fout <<"\\r";
    else if (u==0x5c) fout <<"\\\\";//backslash
    else if (u==0x27) fout <<"\\'";
    else if (u==0x22) fout <<"\\\"";
    else if (u==0x1a) fout <<"\\Z";
    else
        write_utf8byte(fout, u );
}
//-----------------------------------------------------------------------------
// Encodes code point u as UTF-8 and writes the bytes to fd.
// Values below 128 (including negative ones) produce one byte holding the low
// seven bits; values from 128, 2048 and 65536 upwards produce two, three and
// four bytes respectively.
void write_utf8byte(ofstream &fd, const int &u)
{
    char buf[4];
    int len;

    if (u < 128)//1 byte : 7 bits
    {
        buf[0] = static_cast<char>(u & 0x7F);
        len = 1;
    }
    else if (u < 2048)//2 bytes : 11 bits
    {
        buf[0] = static_cast<char>(0xC0 | ((u >> 6) & 0x1F));
        buf[1] = static_cast<char>(0x80 | (u & 0x3F));
        len = 2;
    }
    else if (u < 65536)//3 bytes : 16 bits
    {
        buf[0] = static_cast<char>(0xE0 | ((u >> 12) & 0x0F));
        buf[1] = static_cast<char>(0x80 | ((u >> 6) & 0x3F));
        buf[2] = static_cast<char>(0x80 | (u & 0x3F));
        len = 3;
    }
    else//4 bytes : 21 bits
    {
        buf[0] = static_cast<char>(0xF0 | ((u >> 18) & 0x07));
        buf[1] = static_cast<char>(0x80 | ((u >> 12) & 0x3F));
        buf[2] = static_cast<char>(0x80 | ((u >> 6) & 0x3F));
        buf[3] = static_cast<char>(0x80 | (u & 0x3F));
        len = 4;
    }

    fd.write(buf, len);
}
//-----------------------------------------------------------------------------
code snippets are licensed under Creative Commons CC-By-SA 3.0 (unless otherwise specified)