#include <ctype.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
/* This program is copyright W. Unruh, 2004 <unruh@physics.ubc.ca> and is
 * released under the GPL license. This copyright notice is not to be
 * removed from  any copies or derivatives of this program.
 */


/* Read a file in from stdin, and calculate the frequency of quatrgrams
 * (four letter combinations) The word starts with C_START and ends with
 * C_END. Only lower case letters are used and only punctuation is - and '.
 * The variable node->count is the number of occurances of the first three
 * letters there are in the input list of words. The node->p[l] is how many
 * occurances of the letter numbered l occur after the first three letters.
 * Only lower case letters are used. 
 * The output is a list of words, randomly made up by choosing the next
 * letter with the probability p[l]/count. The log probability is the log
 * to base 2 (bits) product of p[l]/count for each letter in the word. 
 *
 * Only lower case words are created. The entropy of the passwords
 * generated can be increased by randomly sprinkling capital letters
 * instead of lower case. (If those capitals were truely randomly
 * sprinkled, it would add one bit of entropy for each letter in the
 * password) One could also replace some of the letters with numbers, or
 * with punctuation. This would add about 1/2 a bit of entropy per letter.
 *
 *
 * Useage
 *
 * wget [n] <dictionary
 * 
 * n=number of random words, together with their bit count randomness
 * estimate, based on the dictionary used. (10<=n<=100)
 * It is better to use a dictionary, as the words created tend to be
 * longer. (in text most words are short, and this ups the probability that
 * the output will have short words.) 
 * Eg, the sorted unique words from Shakespeare. Or a massive dictionary or
 * word list would be useful.
 * The general result is that each letter is worth about 3 bits of
 * randomness. This undoubtedly is an underestimate, as the words in the
 * dictionary tend to be obscure (most of them). Thus the probability of
 * the quatrgrams are biased towards unusual words. Of course if an attacker
 * used the same dictionary, then this would give a pretty accurate
 * estimate of the randomness of the made up words.
 * 
 *
 * The program uses /dev/urandom to get the random probabilities to select
 * the letters.
 */

static const char wchars[] = "abcdefghijklmnopqrstuvwxyz-'";
     //list of lower case characters recognized by the program. All others
     //are silently ignored. Note that if you want to extend the list, or
     //use some other language, you must change also the next three
     //constants. V
#define VECSZ 30  // number of characters in list + 2
#define C_START VECSZ-2 //start of word designator.
#define C_END VECSZ-1 //end of word designator.
#define MAXLEN 10 //Default Max length of output word.
typedef struct node {
  unsigned count;
  unsigned p[VECSZ];
} node;
static node (*modelp)[VECSZ][VECSZ][VECSZ];
#define model (*modelp)
FILE *urandom,*infile;
char inputfile[100];

unsigned rrand();
double entropy=0., entropy4=0.;

int getopt(int argc,char * const argv[],const char *optstring);
extern char *optarg;
extern int optind,opterr,optopt;
int matrixflag=0;
int main(int argc,char *argv[])
{
  unsigned i, j, k, l,nout=20,length=MAXLEN,c;
  char *optstring="hl:n:f:m"; //h=help l=max length of output words,n=number of output words
  strcpy(inputfile,"/usr/share/dict/words");
  modelp = malloc(sizeof(model)); //allocate space for the array
  if (!modelp) {
    fprintf(stderr, "no memory");
    exit(EXIT_FAILURE);
  }
  
  while((c=getopt(argc,argv,optstring))!=-1)
  {
     
     switch(c)
     {
        case 'n':
           sscanf(optarg,"%d",&nout);
           if(nout<10||nout>100)
           {
              fprintf(stderr,"Useage: wgen [-l # ] [-n #] <dictionary\n where l >=8 and n between 10 and 100\n Error in l =%d\n",length);
              exit(1);
           }
           break;
        case 'l':
           sscanf(optarg,"%d",&length);
           if(length<8)
           {
              fprintf(stderr,"Useage: wgen [-h] [-l # ] [-n #] [-f {dictionary| - <dictionary}] \n where -l >=8 and -n between 10 and 100\nError in n %d\n",nout);
              exit(1);
           }  
           break;
        case 'h':
           fprintf(stderr, "Useage: wgen [-h] [-l # ] [-n #] [-f {-|dictionary}] \n  -l >=8 is max length of output words\n  -n between 10 and 100\n   -h this help\n   dictionary can be either a dictionary \n      or some text article\n\
        Output:\n\
           total number of trigrams (3 letter combinations)\n\
           Raw Entropy= \n\
           tcount=total number of letters\n\
           MaxLength=maximum length of output word\n\
           Number=number of output words\n\
           Entropy3=sum(p ln(p)) for all trigrams\n\
           Entropy4=sum(pln(p) for all quarigrams\n\
           Word put out with effective bit entropy for word\n\
              (sum(r ln_2(r)) for each character in word\n\
               r=number of quatigrams/number of same trigrams\n");
           exit(0);   
           break;
        case 'f':
           {
              int nc;
            if( (nc=sscanf(optarg, "%99s",inputfile))==99 || nc<1){
                  fprintf(stderr," -f option: Input file too long or non-existant\n");
                  exit(1);
            }
           }
           break;
        case 'm':
           matrixflag=1;
           break;



        case '?':
           fprintf(stderr,"Useage: wgen [-h] [-l # ] [-n #] [-f {dictionary|- <dictionary}]\n where -l >=8 and -n between 10 and 100\n");
           exit(1);

     }
  }
            if ( strcmp (inputfile,"-")==0){
               infile=stdin;
            }else{
               if( (infile=fopen(inputfile, "r"))==(FILE *)NULL)
               {fprintf(stderr," Input file %s cannot be opened.\n",inputfile);
                  exit(1);
               }
            }
     
    

//  if(argc>1) // argument is max number of output words
//  {sscanf(argv[1],"%d",&nout);
//    printf("%d --- %s ---- %d\n",argc,argv[1],nout);
//     if(nout<10)nout=10;
//    if(nout>100)nout=100;
//  }

  for (i = 0; i < VECSZ; i++) 
  {
    for (j = 0; j < VECSZ; j++) 
    {
      for (k = 0; k < VECSZ; k++) 
      {
	      model[i][j][k].count = 0;
	      for (l = 0; l < VECSZ; l++)
	          model[i][j][k].p[l] = 0;
      }
    }
  }

  l = C_END;
  i = j = k = C_START;
  for (;;) {
    int ch = getc(infile);
    const char *p;
    node *n = &model[i][j][k];
    if (ch == EOF || isspace(ch)) 
    {
      if (l != C_END) 
      {
   	  l = C_END;
	     n->count++;
	     n->p[l]++;
	     i = j = k = C_START;
      }
      if (ch == EOF)
	      break;
      continue;
    }
    p = strchr(wchars, tolower(ch));
    if (!p)  //character not in set
      continue;
    l = p - wchars; //number of the character
    n->count++;
    n->p[l]++;
    if(n->count>0x80000000)
    {
      char tmp[VECSZ+1];
      strcpy(tmp,wchars);
      tmp[VECSZ-2]=' ';
      tmp[VECSZ-1]='|';
      tmp[VECSZ]=0x0;
      fprintf(stderr,"Dictionary too long. quatrgram has greater than 2GB count. %d \"%c%c%c%c\"\n",n->count, tmp[i],tmp[j],tmp[k],tmp[l]);
      exit(2);
    } 
    i = j; j = k; k = l;
  }

  {
     FILE *matrixfile;
      char tmp[VECSZ+1];
      long tcount=0,tt=0;
      strcpy(tmp,wchars);
      tmp[VECSZ-2]=' ';
      tmp[VECSZ-1]='|';
      tmp[VECSZ]=0x0;
                              
    if(matrixflag){
         matrixfile= fopen("/tmp/matrix","w+");
         printf("opened matrix\n");
    }
  for(i=0;i<VECSZ;i++)
     for(j=0;j<VECSZ;j++)
        for(k=0;k<VECSZ;k++)
        {
           if(matrixflag) fprintf(matrixfile,"%c%c%c ",tmp[i],tmp[j],tmp[k]);
           for(l=0;l<VECSZ;l++)
           {
               if(matrixflag) fprintf(matrixfile,"%d ",model[i][j][k].p[l]);
               if (model[i][j][k].p[l] != 0)
                  entropy4+=model[i][j][k].p[l]*log(model[i][j][k].p[l]);
           }
           if(matrixflag)fprintf(matrixfile,"%d\n",model[i][j][k].count);
           tcount+=model[i][j][k].count;
           if (model[i][j][k].count != 0)
           {
              entropy+=(double)(model[i][j][k].count)*log((double)(model[i][j][k].count));
              tt+=1;
           }

        }
  printf("total trigrms= %ld raw ent= %e tcount = %ld\n",tt,entropy,tcount);
   entropy=-(entropy/tcount -log(tcount))/log(2.);
   entropy4=-(entropy4/tcount-log(tcount))/log(2.);
  }
  

  
  printf("Max length=%d Number=%d, Entropy3=%f Entropy4=%f\n",length,nout, entropy,entropy4);
  
  
  {
     int nn,nc;char input[256];
     unsigned random;
     urandom=fopen("/dev/urandom","r");
     if(urandom == (FILE *)NULL)
     {
        fprintf(stderr,"Cannot open /dev/urandom\n");
           exit(1);
     }
  for (nn=0;nn<nout;nn++) 
  {
    double prob = 0;
    nc=0;
    i = j = k = C_START;
    for (;;) {
      node *n = &model[i][j][k];
      unsigned p = n->count;
//      unsigned max = RAND_MAX - (RAND_MAX % p);
      unsigned max=0xffffffff-((0xffffffff)%p);
      unsigned r;
      if(++nc>length) break;
      do r = rrand(); while (r >= max);
      r = r%p;
      for (l = 0; r >= n->p[l]; r -= n->p[l++])
	;
      prob -= log((double)n->p[l]/(double)n->count)/log(2);
      
      if (l == C_END)
	break;
      fputc(wchars[l], stdout);
      i = j; j = k; k = l;
    }
    printf(" [%g]\n", prob);
  }
  }

  return (0);
}

unsigned rrand()
{
   unsigned int i,r;
   char *cr;
   cr=(char *)&r;
   for(i=0;i<4;i++)
     cr[i]=fgetc(urandom);
   return(r);
} 
