#!/bin/bash

# renew.sh (Creation of the Wikipedia n-grams for dictionary-analysis)
# Copyright (C) 2018 Daniel Marschall, ViaThinkSoft
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA

# Usage: ./renew.sh   (no arguments; may be started from any directory)
#
# Runs the whole pipeline: download the English Wikipedia dump, mangle the
# articles, count n-grams for n=1..6 and derive the Markov/Sinkov statistics.

# Every stage consumes the output of the previous one and runs for a long
# time, so stop at the first failing command instead of processing missing
# or partial data.
# NOTE(review): this assumes the project tools (*.py, wiki_make_stats) exit
# with status 0 on success -- confirm.
set -euo pipefail

# Print a message to stderr and abort the script.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# All paths below are relative to the directory that contains this script.
script_dir=$(dirname -- "$0")
cd -- "$script_dir" || die "cannot change into script directory: $script_dir"

# 1. Get the Wikipedia XML (63 GB!)
wget https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2
bzip2 -d enwiki-latest-pages-articles.xml.bz2
# Drop write permission from the unpacked dump before filing it away.
chmod -w enwiki-latest-pages-articles.xml
# Make sure the target directory exists so that mv cannot fail on it.
mkdir -p wikipedia_xml
mv enwiki-latest-pages-articles.xml wikipedia_xml/

# 2. Extract and mangle the articles
# Total pages: 13,878,481
# Elapsed time: 4:00:12
./wiki_mangling_generator.py

# 3. Create the digrams, trigrams, etc.
# ---------------------------
# n  time    memory
# ---------------------------
# 1  03:31   208
# 2  05:15   5408
# 3  07:15   140608
# 4  12:03   3655808
# 5  25:45   95051008
# 6  46:06   2471326208
# n  (exp.)  8 * 26^n Bytes
# ---------------------------
make
# -p: do not fail when the directory is left over from an earlier run.
mkdir -p stats

# Expand the input glob once; an unmatched glob stays a literal pattern,
# which would otherwise be handed to wiki_make_stats as a file name.
mangled_files=(mangled/mangled_en_*.txt)
[[ -e "${mangled_files[0]}" ]] || die "no mangled/mangled_en_*.txt files found"

for n in 1 2 3 4 5 6; do
  ./wiki_make_stats "$n" "stats/count_en_${n}.csv" "${mangled_files[@]}"
done

# 4. Create Markov and Sinkov (log-weight) statistics
# Requires approx. 37 minutes for n=1..6
./markov_gen.py
# Requires approx. 31 minutes for n=1..6
./sinkov_gen.py