#include
#include
#include
#include
#include
#include
/* This program is copyright W. Unruh, 2004 and is
* released under the GPL license. This copyright notice is not to be
* removed from any copies or derivatives of this program.
*/
/* Read a file in from a file of words (default /usr/share/dict/words)
* , and calculate the frequency of quatrgrams
* (four letter combinations) The word starts with C_START and ends with
* C_END and these form a part of the quatrgram.
* Only lower case letters are used and only punctuation is - and '.
* The variable node->count is the number of occurances of the first three
* letters there are in the input list of words. The node->p[l] is how many
* occurances of the letter numbered l occur after the first three letters.
* Only lower case letters are used.
* The output is a list of words, randomly made up by choosing the next
* letter with the probability p[l]/count. The log probability is the log
* to base 2 (bits) product of p[l]/count for each letter in the word.
*
* Only lower case words are created. The entropy of the passwords
* generated can be increased by randomly sprinkling capital letters
* instead of lower case. (If those capitals were truely randomly
* sprinkled, it would add one bit of entropy for each letter in the
* password) One could also replace some of the letters with numbers, or
* with punctuation. This would add about 1/2 a bit of entropy per letter.
*
*
* Useage
*
* wget [-l #] [-n #] [-f dictionary|-f - 100)
{
fprintf(stderr,"Useage: wgen [-l # ] [-n #] =8 and n between 10 and 100\n Error in l =%d\n",length);
exit(1);
}
break;
case 'l':
sscanf(optarg,"%d",&length);
if(length<8)
{
fprintf(stderr,"Useage: wgen [-h] [-l # ] [-n #] [-f {dictionary| - =8 and -n between 10 and 100\nError in n %d\n",nout);
exit(1);
}
break;
case 'h':
fprintf(stderr, "Useage: wgen [-h] [-l # ] [-n #] [-f {-|dictionary}] \n -l >=8 is max length of output words\n -n between 10 and 100\n -h this help\n dictionary can be either a dictionary \n or some text article\n\
Output:\n\
total number of trigrams (3 letter combinations)\n\
Raw Entropy= \n\
tcount=total number of letters\n\
MaxLength=maximum length of output word\n\
Number=number of output words\n\
Entropy3=sum(p ln(p)) for all trigrams\n\
Entropy4=sum(pln(p) for all quarigrams\n\
Word put out with effective bit entropy for word\n\
(sum(r ln_2(r)) for each character in word\n\
r=number of quatigrams/number of same trigrams\n");
exit(0);
break;
case 'f':
{
int nc;
if( (nc=sscanf(optarg, "%99s",inputfile))==99 || nc<1){
fprintf(stderr," -f option: Input file too long or non-existant\n");
exit(1);
}
}
break;
case 'm':
matrixflag=1;
break;
case '?':
fprintf(stderr,"Useage: wgen [-h] [-l # ] [-n #] [-f {dictionary|- =8 and -n between 10 and 100\n");
exit(1);
}
}
if ( strcmp (inputfile,"-")==0){
infile=stdin;
}else{
if( (infile=fopen(inputfile, "r"))==(FILE *)NULL)
{fprintf(stderr," Input file %s cannot be opened.\n",inputfile);
exit(1);
}
}
// if(argc>1) // argument is max number of output words
// {sscanf(argv[1],"%d",&nout);
// printf("%d --- %s ---- %d\n",argc,argv[1],nout);
// if(nout<10)nout=10;
// if(nout>100)nout=100;
// }
for (i = 0; i < VECSZ; i++)
{
for (j = 0; j < VECSZ; j++)
{
for (k = 0; k < VECSZ; k++)
{
model[i][j][k].count = 0;
for (l = 0; l < VECSZ; l++)
model[i][j][k].p[l] = 0;
}
}
}
l = C_END;
i = j = k = C_START;
for (;;) {
int ch = getc(infile);
const char *p;
node *n = &model[i][j][k];
if (ch == EOF || isspace(ch))
{
if (l != C_END)
{
l = C_END;
n->count++;
n->p[l]++;
i = j = k = C_START;
}
if (ch == EOF)
break;
continue;
}
p = strchr(wchars, tolower(ch));
if (!p) //character not in set
continue;
l = p - wchars; //number of the character
n->count++;
n->p[l]++;
if(n->count>0x80000000)
{
char tmp[VECSZ+1];
strcpy(tmp,wchars);
tmp[VECSZ-2]=' ';
tmp[VECSZ-1]='|';
tmp[VECSZ]=0x0;
fprintf(stderr,"Dictionary too long. quatrgram has greater than 2GB count. %d \"%c%c%c%c\"\n",n->count, tmp[i],tmp[j],tmp[k],tmp[l]);
exit(2);
}
i = j; j = k; k = l;
}
{
FILE *matrixfile;
char tmp[VECSZ+1];
long tcount=0,tt=0;
strcpy(tmp,wchars);
tmp[VECSZ-2]=' ';
tmp[VECSZ-1]='|';
tmp[VECSZ]=0x0;
if(matrixflag){
matrixfile= fopen("/tmp/matrix","w+");
printf("opened matrix\n");
}
for(i=0;icount;
// unsigned max = RAND_MAX - (RAND_MAX % p);
unsigned max=0xffffffff-((0xffffffff)%p);
unsigned r;
if(++nc>length) break;
do r = rrand(); while (r >= max);
r = r%p;
for (l = 0; r >= n->p[l]; r -= n->p[l++])
;
prob -= log((double)n->p[l]/(double)n->count)/log(2);
if (l == C_END)
break;
fputc(wchars[l], stdout);
i = j; j = k; k = l;
}
printf(" [%g] [%g]\n", prob, nc>0?prob/nc:0);
}
}
return (0);
}
unsigned rrand()
{
unsigned int i,r;
char *cr;
cr=(char *)&r;
for(i=0;i<4;i++)
cr[i]=fgetc(urandom);
return(r);
}