/*
 * wiki_make_stats.c (counting the digrams, trigrams, quadgrams, n-grams of the mangled Wikipedia articles)
 * Copyright (C) 2018 Daniel Marschall, ViaThinkSoft
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/*
 * NOTE(review): this file reached review with its whitespace collapsed and
 * with every "<...>" span stripped (header names, usage placeholders, and
 * parts of three function bodies). The header list below and the code
 * marked "reconstructed" were rebuilt from the surviving text; please
 * compare them against the pristine upstream source before merging.
 */
#include <err.h>
#include <fcntl.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

//#ifdef SHOW_STATUS

/*
 * Largest supported n: 26^13 still fits into int64_t, 26^14 does not.
 */
#define MAX_N 13

/*
 * The array that contains the occurrence-counts.
 * For sequence "abc", the index is a*26*26 + b*26 + c
 * with a=0, b=1, ...
 */
typedef int64_t counter_t;
counter_t* gc;

/*
 * Returns non-zero if stat() succeeds on `filename`, 0 otherwise.
 */
int fileexists(char *filename) {
	struct stat buffer;
	return stat(filename, &buffer) == 0;
}

/*
 * Returns the size of `filename` in bytes, or -1 if stat() fails
 * (the original read an uninitialized struct stat in that case).
 */
int64_t filesize(const char* filename) {
	struct stat st;
	if (stat(filename, &st) != 0) {
		return -1;
	}
	return st.st_size;
}

/*
 * Formats `bytes` as a human readable size ("1.50 MB") into `str` and
 * returns `str`.
 *
 * `str` must be provided by the caller; 32 bytes (what the callers in this
 * file pass) is sufficient for every int64_t input.
 */
const char *human_friendly_size(int64_t bytes, char *str) {
	// Source: http://collectivesolver.com/5738/how-to-format-bytes-to-kilobytes-megabytes-gigabytes-and-terabytes-in-c
	static const char *const sizes[] = { "B", "KB", "MB", "GB", "TB" };
	const int last_unit = (int)(sizeof sizes / sizeof sizes[0]) - 1;

	int i = 0;
	double dblByte = (double)bytes;
	/* Stop at the last unit: the original bound let `i` run one past the
	   end of sizes[] for inputs of 1024^5 bytes or more. */
	while (i < last_unit && bytes >= 1024) {
		dblByte = bytes / 1024.0;
		bytes /= 1024;
		i++;
	}

	sprintf(str, "%.2f %s", dblByte, sizes[i]);
	return str;
}

/*
 * Prints a hint to stderr telling the user how much memory is needed:
 * the count-array for the given `n` plus the biggest input file.
 *
 * argv[begin_infiles_idx] .. argv[argc-1] are taken as the input files.
 *
 * NOTE(review): the two loops were partly stripped in the reviewed text and
 * are reconstructed from the surviving identifiers.
 */
void show_memory_error(int argc, char** argv, int n, int begin_infiles_idx) {
	/* Calculate the size of the count-array (size = 26^n) */
	int64_t gc_size = 1;
	for (int i=0; i<n; ++i) {
		gc_size *= 26;
	}

	/* Files that cannot be stat'ed report -1 and therefore never win. */
	int64_t iBiggestFile = 0;
	for (int i=begin_infiles_idx; i<argc; ++i) {
		int64_t iThisSize = filesize(argv[i]);
		if (iThisSize > iBiggestFile) {
			iBiggestFile = iThisSize;
		}
	}

	int64_t iGC = gc_size * sizeof(counter_t);
	int64_t iTotalMem = iGC + iBiggestFile;

	char s1[32] = "";
	char s2[32] = "";
	char s3[32] = "";
	fprintf(stderr, "Not enough memory! (You need at least %s free memory (%s for n=%d and %s for the biggest input file)\n",
	        human_friendly_size(iTotalMem, s1),
	        human_friendly_size(iGC, s2),
	        n,
	        human_friendly_size(iBiggestFile, s3));
}

#define ERR_READLINES_OPEN 1
#define ERR_READLINES_STAT 2
#define ERR_READLINES_MMAP 3

/*
 * Maps `fname` into memory and, for every position in the file, increments
 * the global counter gc[] for the n-gram starting there. Windows that
 * contain any byte outside 'a'..'z' are skipped.
 *
 * gc must already point to at least 26^n counters.
 *
 * Returns 0 on success or one of the ERR_READLINES_* codes; a diagnostic
 * is written to stderr on failure.
 *
 * The original called err(1, ...) on failure, which terminates the process,
 * so none of its error returns (nor its close(fd)) were ever reached. This
 * version uses warn() so the caller gets the code and can react, e.g. with
 * a memory hint for ERR_READLINES_MMAP.
 */
int read_lines(const char * fname, int n) {
	int fd = open(fname, O_RDONLY);
	if (fd == -1) {
		warn("open: %s", fname);
		return ERR_READLINES_OPEN;
	}

	struct stat fs;
	if (fstat(fd, &fs) == -1) {
		warn("stat: %s", fname);
		close(fd);
		return ERR_READLINES_STAT;
	}

	/* A file shorter than n holds no n-gram. This also covers empty files,
	   for which mmap() would fail because a zero length is invalid. */
	if (n <= 0 || fs.st_size < n) {
		close(fd);
		return 0;
	}

	char *buf = mmap(NULL, fs.st_size, PROT_READ, MAP_SHARED, fd, 0);
	if (buf == MAP_FAILED) {
		warn("mmap: %s", fname);
		close(fd);
		return ERR_READLINES_MMAP; // most likely the failure is out-of-memory
	}

	const char *buf_end = buf + fs.st_size;

	/* NOTE(review): the loop header and the start of the inner loop were
	   stripped in the reviewed text; reconstructed as a sliding window of
	   n bytes, which matches the surviving loop body. */
	for (const char *b = buf; b + n <= buf_end; ++b) {
		int64_t idx = 0;
		int valid = 1;
		for (int j = 0; j < n; ++j) {
			int v = b[j] - 'a';
			if ((v >= 0) && (v < 26)) {
				idx = idx*26 + v;
			} else {
				valid = 0;
				break;
			}
		}
		if (valid) {
			gc[idx]++;
		}
	}

	munmap(buf, fs.st_size);
	close(fd);
	return 0;
}

/*
 * Usage: wiki_make_stats <n> <outfile> <infile> [<infile> ...]
 *
 * Counts all n-grams over 'a'..'z' in the input files and writes one CSV
 * line "gram,count" per possible n-gram to <outfile>.
 *
 * Exits with status 1 on a usage error, an invalid n, an allocation
 * failure, or the first input/output error.
 *
 * NOTE(review): everything between the size calculation and the CSV loop
 * was stripped in the reviewed text and has been reconstructed. In
 * particular it is unknown whether the original consulted fileexists()
 * (e.g. for the output file) or skipped zero counts; neither is done here.
 */
int main(int argc, char** argv) {
	/* Parse parameters */
	if (argc < 4) {
		fprintf(stderr, "Syntax: %s <n> <outfile> <infile> [<infile> ...]\n", argv[0]);
		return 1;
	}

	/* strtol instead of atoi, so garbage and out-of-range values are
	   rejected rather than silently producing a bogus array size. */
	char *end = NULL;
	const long n_arg = strtol(argv[1], &end, 10);
	if (end == argv[1] || *end != '\0' || n_arg < 1 || n_arg > MAX_N) {
		fprintf(stderr, "Invalid n '%s' (expected an integer from 1 to %d)\n", argv[1], MAX_N);
		return 1;
	}
	const int n = (int)n_arg;
	const char* outfile = argv[2];
	const int begin_infiles_idx = 3;

	/* Calculate the size of the count-array (size = 26^n) */
	int64_t gc_size = 1;
	for (int i=0; i<n; ++i) {
		gc_size *= 26;
	}

	/* Allocate the zero-initialized count-array */
	if ((uint64_t)gc_size > SIZE_MAX / sizeof *gc) {
		show_memory_error(argc, argv, n, begin_infiles_idx);
		return 1;
	}
	gc = calloc((size_t)gc_size, sizeof *gc);
	if (gc == NULL) {
		show_memory_error(argc, argv, n, begin_infiles_idx);
		return 1;
	}

	/* Count the n-grams of every input file; stop at the first failure
	   (the original terminated with status 1 inside read_lines). */
	for (int i = begin_infiles_idx; i < argc; ++i) {
		const int rc = read_lines(argv[i], n);
		if (rc != 0) {
			if (rc == ERR_READLINES_MMAP) {
				show_memory_error(argc, argv, n, begin_infiles_idx);
			}
			free(gc);
			return 1;
		}
	}

	/* Write the CSV file */
	FILE *fp = fopen(outfile, "w");
	if (fp == NULL) {
		warn("fopen: %s", outfile);
		free(gc);
		return 1;
	}

	char gram[MAX_N + 1];
	gram[n] = '\0';
	for (int64_t i = 0; i < gc_size; ++i) {
		// Convert the array index back into its n-gram (base 26, 'a' = 0)
		int64_t c = i;
		for (int j=n-1; j>=0; --j) {
			gram[j] = (char)(c%26 + 'a');
			c /= 26;
		}

		// Write the line to the CSV file
		fprintf(fp, "%s,%" PRId64 "\n", gram, gc[i]);
	}

	/* fclose() flushes, so a full disk may only show up here. */
	int write_failed = ferror(fp);
	if (fclose(fp) != 0) {
		write_failed = 1;
	}
	free(gc);

	if (write_failed) {
		warnx("write error: %s", outfile);
		return 1;
	}
	return 0;
}