#include #include #include #include #include #include /* This program is copyright W. Unruh, 2004 and is * released under the GPL license. This copyright notice is not to be * removed from any copies or derivatives of this program. */ /* Read a file in from a file of words (default /usr/share/dict/words) * , and calculate the frequency of quatrgrams * (four letter combinations) The word starts with C_START and ends with * C_END and these form a part of the quatrgram. * Only lower case letters are used and only punctuation is - and '. * The variable node->count is the number of occurances of the first three * letters there are in the input list of words. The node->p[l] is how many * occurances of the letter numbered l occur after the first three letters. * Only lower case letters are used. * The output is a list of words, randomly made up by choosing the next * letter with the probability p[l]/count. The log probability is the log * to base 2 (bits) product of p[l]/count for each letter in the word. * * Only lower case words are created. The entropy of the passwords * generated can be increased by randomly sprinkling capital letters * instead of lower case. (If those capitals were truely randomly * sprinkled, it would add one bit of entropy for each letter in the * password) One could also replace some of the letters with numbers, or * with punctuation. This would add about 1/2 a bit of entropy per letter. * * * Useage * * wget [-l #] [-n #] -d # [-f dictionary|-f - 100) { fprintf(stderr,"Useage: wgen [-l # ] [-n #] l >=8, n between 10 and 100 and d less than 4\n Error in n =%d\n",nout); exit(1); } break; case 'l': sscanf(optarg,"%d",&length); if(length<8 || length>99) { fprintf(stderr,"Useage: wgen [-h] [-l # ] [-n #] [-f {dictionary| - -l >=8, -n between 10 and 100, -d <= 4\nError in l %d\n",length); exit(1); } break; case 'd' : sscanf(optarg, "%f",&probdensity); if(probdensity>4.1) { fprintf(stderr,"Useage: wgen [-h] [-l # ] [-n #] [-f {dictionary| - -l >=8 and -n between 10 and 100 and d <=4 \nError in d %f \n",probdensity); exit(1); } break; case 'h': fprintf(stderr, "Useage: wgen [-h] [-l # ] [-n #] [-f {-|dictionary}] \n -l >=8 is max length of output words\n -n between 10 and 100 \n d less than or equal to 4 is the min entropy per character\n -h this help\n dictionary can be either a dictionary \n or some text article\n\ Output:\n\ total number of trigrams (3 letter combinations)\n\ Raw Entropy= \n\ tcount=total number of letters\n\ MaxLength=maximum length of output word\n\ Number=number of output words\n\ Entropy3=sum(p ln(p)) for all trigrams\n\ Entropy4=sum(pln(p) for all quarigrams\n\ Word put out with effective bit entropy for word\n\ (sum(r ln_2(r)) for each character in word\n\ r=number of quatigrams/number of same trigrams\n"); exit(0); break; case 'f': { int nc; if( (nc=sscanf(optarg, "%99s",inputfile))==99 || nc<1){ fprintf(stderr," -f option: Input file too long or non-existant\n"); exit(1); } } break; case 'm': matrixflag=1; break; case '?': fprintf(stderr,"Useage: wgen [-h] [-l # ] [-n #] [-f {dictionary|- =8 and -n between 10 and 100\n"); exit(1); } } if ( strcmp (inputfile,"-")==0){ infile=stdin; }else{ if( (infile=fopen(inputfile, "r"))==(FILE *)NULL) {fprintf(stderr," Input file %s cannot be opened.\n",inputfile); exit(1); } } // if(argc>1) // argument is max number of output words // {sscanf(argv[1],"%d",&nout); // printf("%d --- %s ---- %d\n",argc,argv[1],nout); // if(nout<10)nout=10; // if(nout>100)nout=100; // } for (i = 0; i < VECSZ; i++) { for (j = 0; j < VECSZ; j++) { for (k = 0; k < VECSZ; k++) { model[i][j][k].count = 0; for (l = 0; l < VECSZ; l++) model[i][j][k].p[l] = 0; } } } l = C_END; i = j = k = C_START; for (;;) { int ch = getc(infile); const char *p; node *n = &model[i][j][k]; if (ch == EOF || isspace(ch)) { if (l != C_END) { l = C_END; n->count++; n->p[l]++; i = j = k = C_START; } if (ch == EOF) break; continue; } p = strchr(wchars, tolower(ch)); if (!p) //character not in set continue; l = p - wchars; //number of the character n->count++; n->p[l]++; if(n->count>0x80000000) { char tmp[VECSZ+1]; strcpy(tmp,wchars); tmp[VECSZ-2]=' '; tmp[VECSZ-1]='|'; tmp[VECSZ]=0x0; fprintf(stderr,"Dictionary too long. quatrgram has greater than 2GB count. %d \"%c%c%c%c\"\n",n->count, tmp[i],tmp[j],tmp[k],tmp[l]); exit(2); } i = j; j = k; k = l; } { FILE *matrixfile; char tmp[VECSZ+1]; long tcount=0,tt=0; strcpy(tmp,wchars); tmp[VECSZ-2]=' '; tmp[VECSZ-1]='|'; tmp[VECSZ]=0x0; if(matrixflag){ matrixfile= fopen("/tmp/matrix","w+"); printf("opened matrix\n"); } for(i=0;icount; // unsigned max = RAND_MAX - (RAND_MAX % p); unsigned max=0xffffffff-((0xffffffff)%p); unsigned r; if(++nc>length) { nc--; break; } do r = rrand(); while (r >= max); r = r%p; for (l = 0; r >= n->p[l]; r -= n->p[l++]) ; prob -= log((double)n->p[l]/(double)n->count)/log(2); if (l == C_END) { output[nc-1]=' '; break; } else { // fputc(wchars[l], stdout); output[nc-1]=wchars[l]; } i = j; j = k; k = l; } if (prob/nc>probdensity) printf(" %s-- [%g] [%g]\n",output, prob, nc>0?prob/(nc):0); else { // printf(".\n"); nn--; } } } return (0); } unsigned rrand() { unsigned int i,r; char *cr; cr=(char *)&r; for(i=0;i<4;i++) cr[i]=fgetc(urandom); return(r); }