C++ Convert Wikipedia XML Dump To Mysql (UTF-8 safe)
Here is a project for parsing the Wikipedia XML database dump into MySQL, for use in MediaWiki or anything else.
Wikipedia, the online encyclopedia, is also made available for download for offline and research use. To download the English Wikipedia database dump, go to http://download.wikimedia.org/enwiki/. The download link I used was for pages-articles.xml.bz2 (4.1 GB). When you decompress it, it grows to a 19GB XML file.
Several challenges,
(1) The 19GB XML file is larger than 4GB, so some 32-bit compilers have trouble reading a file this large. I used 'g++' on Linux and had no issues.
(2) the xml file is encoded in UTF-8. As you can see in the xml parser below, I assume that xml tags and attributes are all valid ascii, which is the case in the enwiki-20081008-pages-articles.xml file. It is the nodevalues which can contain the international UTF-8 characters (ie: <page>nodevalue</page> ).
(3) No character corruption (properly handle the XML entities &quot; &lt; &gt; &amp;, then properly escape special characters for MySQL).
Here is a table which shows how UTF-8 spreads the data for a single character across multiple bytes. UTF-8 was well designed, all ascii characters are also valid UTF-8 characters. More info at http://en.wikipedia.org/wiki/UTF-8.
bytes | bits | representation |
1 | 7 | 0bbbbbbb |
2 | 11 | 110bbbbb 10bbbbbb |
3 | 16 | 1110bbbb 10bbbbbb 10bbbbbb |
4 | 21 | 11110bbb 10bbbbbb 10bbbbbb 10bbbbbb |
First, create your mysql database.
mysql --host=localhost --user=myuser --password=mypass < create.sql
Second, compile and run main.cpp in the same directory as enwiki-20081008-pages-articles.xml. It will produce a 17GB file out.sql.
Finally, import into mysql:
mysql --host=localhost --user=myuser --password=mypass --database=wikipedia < out.sql
create.sql
-- Schema for the converter's output: one row per <page>, holding the page
-- title and its wikitext.
-- utf8mb4 (MySQL 5.5.3+) is used because MySQL's legacy `utf8` charset stores
-- at most 3 bytes per character and cannot hold the 4-byte UTF-8 sequences
-- (code points above U+FFFF) that the converter writes.
CREATE DATABASE `wikipedia`;
USE `wikipedia`;
CREATE TABLE `wiki` (
  `id` bigint(20) UNSIGNED NOT NULL AUTO_INCREMENT,
  `title` text,
  `text` longtext,
  PRIMARY KEY (`id`)
) ENGINE=MyISAM DEFAULT CHARSET=utf8mb4;
main.cpp
#include <iostream>
#include <fstream>
#include <string>
#include <vector>
#include <map>
#include <exception>
using namespace std;

int getnext_utf8_byte(ifstream &fd);
void write_utf8byte(ofstream &fd, const int &u);
void write_utf8byte_quoted(ofstream &fout, const int &u);
char get_next_byte(ifstream& fd);
ostream& operator << (ostream& os, vector<int> &str);

// One XML element as seen by this streaming parser: the tag name, its
// attributes, and the character data that follows the tag up to the next '<'.
class XmlNode
{
public:
    string tagname;                    // ASCII tag name; closing tags keep their leading '/'
    vector<int> data;                  // node value, one Unicode code point per element
    map< string, string > attributes;  // attribute name -> value (non-ASCII replaced by '?')

    void readtagname(ifstream &fd, int &a);
    void readattributes(ifstream &fd, int &a);
    void readtagclose(ifstream &fd, int &a);
    void readnodevalue(ifstream &fd, int &a);
};

//-----------------------------------------------------------------------------
// True for the four characters XML treats as whitespace.
static bool is_xml_space(int c)
{
    return c==0x20 || c==0x09 || c==0x0A || c==0x0D;
}

//-----------------------------------------------------------------------------
// Tag and attribute text is assumed to be ASCII; anything else becomes '?'.
static char ascii_or_placeholder(int c)
{
    return (0<=c && c<=127) ? static_cast<char>(c) : '?';
}

//-----------------------------------------------------------------------------
// Decode a complete entity token (including the leading '&' and trailing ';').
// Handles the five predefined XML entities and numeric character references
// (&#NNN; and &#xHHH;). Returns the code point, or -1 if the token is not a
// recognised entity.
static int decode_entity(const string &token)
{
    if (token=="&quot;") return 0x22;
    if (token=="&lt;")   return 0x3C;
    if (token=="&gt;")   return 0x3E;
    if (token=="&amp;")  return 0x26;
    if (token=="&apos;") return 0x27;

    const string::size_type len = token.size();
    if (len<4 || token[1]!='#' || token[len-1]!=';')
        return -1;

    const bool hex = (token[2]=='x' || token[2]=='X');
    const int base = hex ? 16 : 10;
    string::size_type i = hex ? 3 : 2;
    if (i >= len-1)
        return -1;//no digits

    int value = 0;
    for (; i<len-1; i++)
    {
        const char c = token[i];
        int digit;
        if (c>='0' && c<='9')             digit = c-'0';
        else if (hex && c>='a' && c<='f') digit = c-'a'+10;
        else if (hex && c>='A' && c<='F') digit = c-'A'+10;
        else return -1;
        value = value*base + digit;
        if (value > 0x10FFFF)
            return -1;//beyond the Unicode range (also keeps value from overflowing)
    }
    return (value==0) ? -1 : value;
}

//-----------------------------------------------------------------------------
// Converts a MediaWiki XML dump into a file of SQL INSERT statements.
// Usage: program [input.xml [output.sql]]
// Both paths default to the file names used in the original article.
// Returns 1 if either file cannot be opened, 0 otherwise.
int main(int argc, char* argv[])
{
    const char *input_path  = (argc>1) ? argv[1] : "enwiki-20081008-pages-articles.xml";
    const char *output_path = (argc>2) ? argv[2] : "out.sql";
    bool inpage=false;

    //for debugging
    cout.unsetf(ios::dec);
    cout.setf(ios::hex);

    // Binary mode: the bytes must pass through untouched on every platform.
    ifstream fd(input_path, ios::in | ios::binary);
    if (!fd)
    {
        cerr << "cannot open input file " << input_path << endl;
        return 1;
    }
    ofstream fout(output_path, ios::out | ios::binary);
    if (!fout)
    {
        cerr << "cannot open output file " << output_path << endl;
        return 1;
    }

    try
    {
        int a = getnext_utf8_byte(fd);
        while (fd.good())
        {
            if (a!=0x3C)//not '<'
            {
                // Skip anything before the first tag (BOM, whitespace).
                // Without this the loop would spin forever on such input.
                a = getnext_utf8_byte(fd);
                continue;
            }

            XmlNode node;
            node.readtagname(fd, a);
            node.readattributes(fd, a);
            node.readtagclose(fd, a);
            node.readnodevalue(fd, a);

            if (node.tagname=="page")
            {
                inpage=true;
            }
            else if (node.tagname=="/page")
            {
                inpage=false;
            }
            else if (inpage && node.tagname=="title")
            {
                // <title> opens the INSERT statement; the later <text> closes it.
                fout << "insert into `wiki`(`title`,`text`) values('";
                for (vector<int>::size_type i=0, x=node.data.size(); i<x; i++)
                    write_utf8byte_quoted(fout, node.data[i]);
                fout << "',";
            }
            else if (inpage && node.tagname=="text")
            {
                fout << "'";
                for (vector<int>::size_type i=0, x=node.data.size(); i<x; i++)
                    write_utf8byte_quoted(fout, node.data[i]);
                fout << "');" << '\n';//'\n' rather than endl: no flush per row
            }
        }//while (fd.good())
    }
    catch (const exception &e)//end of file exception will be thrown
    {
        cout << e.what() << endl;
    }

    fd.close();
    fout.close();
    return 0;
}

//-----------------------------------------------------------------------------
// Reads the tag name that follows '<'. On return `a` holds the first
// character after the name (whitespace or '>').
void XmlNode::readtagname(ifstream &fd, int &a)
{
    this->tagname.clear();
    a = getnext_utf8_byte(fd);
    while (!is_xml_space(a) && a!=0x3E)//'>'
    {
        this->tagname += ascii_or_placeholder(a);
        a = getnext_utf8_byte(fd);
    }
}

//-----------------------------------------------------------------------------
// Reads name="value" / name='value' pairs into `attributes` until the closing
// '>' is reached. On return `a` is '>'. A '/' before '>' (self-closing tag)
// is consumed and ignored.
void XmlNode::readattributes(ifstream &fd, int &a)
{
    while (true)
    {
        string token;
        while (is_xml_space(a)) { a = getnext_utf8_byte(fd); }//eat whitespace
        while (!is_xml_space(a) && a!=0x3D && a!=0x3E)//stop at whitespace,'=','>'
        {
            token += ascii_or_placeholder(a);
            a = getnext_utf8_byte(fd);
        }
        while (is_xml_space(a)) { a = getnext_utf8_byte(fd); }//eat whitespace

        if (a==0x3D)//'='
        {
            const string attribute_name = token;
            a = getnext_utf8_byte(fd);
            while (is_xml_space(a)) { a = getnext_utf8_byte(fd); }//eat whitespace

            // Kept as int so a non-ASCII code point can never alias a quote.
            const int delimiter = a;
            if (delimiter==0x22 || delimiter==0x27)//'"','\''
            {
                string value;
                a = getnext_utf8_byte(fd);
                // '>' also ends the value so an unterminated quote cannot
                // swallow the rest of the document.
                while (a!=delimiter && a!=0x3E)
                {
                    value += ascii_or_placeholder(a);
                    a = getnext_utf8_byte(fd);
                }
                this->attributes[ attribute_name ] = value;
                if (a==delimiter) { a = getnext_utf8_byte(fd); }//eat closing quote
            }
        }

        while (is_xml_space(a)) { a = getnext_utf8_byte(fd); }//eat whitespace
        if (a==0x2F) { a = getnext_utf8_byte(fd); }//eat /
        if (a==0x3E) { break; }//break on >
    }
}

//-----------------------------------------------------------------------------
// Consumes the '>' that ends the tag, if `a` is currently on it.
void XmlNode::readtagclose(ifstream &fd, int &a)
{
    if (a==0x3E) { a = getnext_utf8_byte(fd); }//eat >
}

//-----------------------------------------------------------------------------
// Appends the character data up to (not including) the next '<' to `data`,
// decoding XML entities on the way. Text that starts with '&' but is not a
// recognised entity is kept verbatim.
void XmlNode::readnodevalue(ifstream &fd, int &a)
{
    while (a!=0x3C)//'<' read nodevalue
    {
        if (a==0x26)//if &
        {
            vector<int> raw;//exact code points, for the verbatim fallback
            string token;
            while (a!=0x3B && a!=0x3C)//read until ';' (or '<' if malformed)
            {
                raw.push_back(a);
                token += ascii_or_placeholder(a);
                a = getnext_utf8_byte(fd);
            }
            if (a==0x3B)
            {
                raw.push_back(a);
                token += ';';
                a = getnext_utf8_byte(fd);
            }

            const int decoded = decode_entity(token);
            if (decoded>=0)
                this->data.push_back(decoded);
            else
                this->data.insert(this->data.end(), raw.begin(), raw.end());
        }
        else//regular utf8 char (not ampersand entity)
        {
            this->data.push_back(a);
            a = getnext_utf8_byte(fd);
        }
    }
}

//-----------------------------------------------------------------------------
// Reads one UTF-8 encoded character from fd and returns its Unicode code
// point (despite the name, the result is a code point, not a byte).
// Malformed input (bad lead byte, missing continuation byte, overlong form,
// surrogate, value above U+10FFFF) is reported on cerr and yields U+FFFD;
// a byte that is not a valid continuation is left in the stream.
// Throws ios_base::failure("end of file") when the input is exhausted.
int getnext_utf8_byte(ifstream &fd)
{
    const int replacement = 0xFFFD;

    const int lead = static_cast<unsigned char>( get_next_byte(fd) );
    if (lead<0x80)
        return lead;

    int continuation_count;
    int codepoint;
    int minimum;//smallest code point that legitimately needs this length
    if ((lead&0xE0)==0xC0)      { continuation_count=1; codepoint=lead&0x1F; minimum=0x80; }
    else if ((lead&0xF0)==0xE0) { continuation_count=2; codepoint=lead&0x0F; minimum=0x800; }
    else if ((lead&0xF8)==0xF0) { continuation_count=3; codepoint=lead&0x07; minimum=0x10000; }
    else
    {
        cerr << "Invalid UTF8 Character "<<endl;
        return replacement;
    }

    for (int i=0; i<continuation_count; i++)
    {
        // peek() first so a non-continuation byte is never consumed; this
        // avoids relying on multi-character putback.
        const int next = fd.peek();
        if (next==ifstream::traits_type::eof())
            throw ios_base::failure("end of file");
        if ((next&0xC0)!=0x80)
        {
            cerr << "Invalid UTF8 Character "<<endl;
            return replacement;
        }
        fd.get();
        codepoint = (codepoint<<6) | (next&0x3F);
    }

    if (codepoint<minimum || codepoint>0x10FFFF || (codepoint>=0xD800 && codepoint<=0xDFFF))
    {
        cerr << "Invalid UTF8 Character "<<endl;
        return replacement;
    }
    return codepoint;
}

//-----------------------------------------------------------------------------
// Reads a single raw byte; throws ios_base::failure("end of file") if the
// stream is not good after the read.
char get_next_byte(ifstream& fd)
{
    char a;
    fd.get(a);
    if (!fd.good())
        throw ios_base::failure("end of file");
    return a;
}

//-----------------------------------------------------------------------------
// Debug printer: code points in the range 32..127 are written as characters,
// everything else as '?'.
ostream& operator << (ostream& os, vector<int> &str)
{//for debugging
    for (vector<int>::size_type i=0; i<str.size(); i++)
    {
        if (32<=str[i] && str[i]<=127)
            os << static_cast<char>(str[i]);
        else
            os << "?";
    }
    return os;
}

//-----------------------------------------------------------------------------
// Writes one code point into a MySQL string literal, backslash-escaping the
// characters MySQL treats specially. NUL and Ctrl-Z use MySQL's own escapes
// (\0 and \Z); "\x00" / "\x1a" are not MySQL escape sequences.
void write_utf8byte_quoted(ofstream &fout, const int &u)
{
    switch (u)
    {
        case 0x00: fout << "\\0";   break;
        case 0x0a: fout << "\\n";   break;
        case 0x0d: fout << "\\r";   break;
        case 0x5c: fout << "\\\\";  break;
        case 0x27: fout << "\\'";   break;
        case 0x22: fout << "\\\"";  break;
        case 0x1a: fout << "\\Z";   break;
        default:   write_utf8byte(fout, u); break;
    }
}
//-----------------------------------------------------------------------------
// Encodes the Unicode code point u as UTF-8 (1 to 4 bytes) and writes it to fd.
// Values that are not valid Unicode scalar values (negative, above U+10FFFF,
// or in the surrogate range U+D800..U+DFFF) are written as U+FFFD so the
// output is always well-formed UTF-8.
void write_utf8byte(std::ofstream &fd, const int &u)
{
    int cp = u;
    if (cp<0 || cp>0x10FFFF || (cp>=0xD800 && cp<=0xDFFF))
        cp = 0xFFFD;

    char bytes[4];
    int bytecount;
    if (cp>=0x10000)//4 bytes: 21 payload bits
    {
        bytes[0] = static_cast<char>(0xF0 | ((cp >> 18) & 0x07));
        bytes[1] = static_cast<char>(0x80 | ((cp >> 12) & 0x3F));
        bytes[2] = static_cast<char>(0x80 | ((cp >> 6) & 0x3F));
        bytes[3] = static_cast<char>(0x80 | (cp & 0x3F));
        bytecount = 4;
    }
    else if (cp>=0x800)//3 bytes: 16 payload bits
    {
        bytes[0] = static_cast<char>(0xE0 | ((cp >> 12) & 0x0F));
        bytes[1] = static_cast<char>(0x80 | ((cp >> 6) & 0x3F));
        bytes[2] = static_cast<char>(0x80 | (cp & 0x3F));
        bytecount = 3;
    }
    else if (cp>=0x80)//2 bytes: 11 payload bits
    {
        bytes[0] = static_cast<char>(0xC0 | ((cp >> 6) & 0x1F));
        bytes[1] = static_cast<char>(0x80 | (cp & 0x3F));
        bytecount = 2;
    }
    else//1 byte: plain ASCII
    {
        bytes[0] = static_cast<char>(cp & 0x7F);
        bytecount = 1;
    }
    fd.write(bytes, bytecount);
}
//-----------------------------------------------------------------------------
code snippets are licensed under Creative Commons CC-By-SA 3.0 (unless otherwise specified)
![]() |