Incremental update. Continuing work on lattice codebooks.

svn path=/branches/new_acoustics_pending_merge_20000328/vorbis/; revision=345
author: Monty <xiphmont@xiph.org> 2000-04-26 07:10:16 +0000
committer: Monty <xiphmont@xiph.org> 2000-04-26 07:10:16 +0000
commit: de5ef5ea3d8ca2fffac1d247520aea82c7bf11f5 (patch)
tree: 116c839881d24235e1cafed6409a6afed123345b
parent: 097027d02df14cab9f0ffe712995aa7f7f16f392 (diff)
download: libvorbis-git-de5ef5ea3d8ca2fffac1d247520aea82c7bf11f5.tar.gz
8 files changed, 631 insertions, 248 deletions
diff --git a/include/vorbis/codebook.h b/include/vorbis/codebook.h
index ee4b013e..8e45c635 100644
--- a/include/vorbis/codebook.h
+++ b/include/vorbis/codebook.h
@@ -12,7 +12,7 @@
  ********************************************************************
 
  function: codebook types
- last mod: $Id: codebook.h,v 1.4.4.5 2000/04/21 16:35:38 xiphmont Exp $
+ last mod: $Id: codebook.h,v 1.4.4.6 2000/04/26 07:10:15 xiphmont Exp $
 
  ********************************************************************/
 
@@ -81,7 +81,8 @@ typedef struct encode_aux_nearestmatch{
 typedef struct encode_aux_threshmatch{
   double *quantthresh;
   long   *quantmap;
-  int     quantvals;
+  int     quantvals; 
+  int     threshvals; 
 } encode_aux_threshmatch;
 
 typedef struct decode_aux{
diff --git a/lib/sharedbook.c b/lib/sharedbook.c
index 0c144bcc..bc3987d4 100644
--- a/lib/sharedbook.c
+++ b/lib/sharedbook.c
@@ -12,7 +12,7 @@
  ********************************************************************
 
  function: basic shared codebook operations
- last mod: $Id: sharedbook.c,v 1.1.2.5 2000/04/21 16:35:40 xiphmont Exp $
+ last mod: $Id: sharedbook.c,v 1.1.2.6 2000/04/26 07:10:15 xiphmont Exp $
 
  ********************************************************************/
 
@@ -323,6 +323,27 @@ int _best(codebook *book, double *a, int step){
   int dim=book->dim;
   int ptr=0,k,o;
 
+  /* we assume for now that a thresh tree is the only other possibility */
+  if(tt){
+    int index=0;
+    /* find the quant val of each scalar */
+    for(k=0,o=step*(dim-1);k<dim;k++,o-=step){
+      int i;
+      /* linear search the quant list for now; it's small and although
+	 with > 8 entries, it would be faster to bisect, this would be
+	 a misplaced optimization for now */
+      for(i=0;i<tt->threshvals-1;i++)
+	if(a[o]<tt->quantthresh[i])break;
+
+      index=(index*tt->quantvals)+tt->quantmap[i];
+    }
+    /* regular lattices are easy :-) */
+    if(book->c->lengthlist[index]>0) /* is this unused?  If so, we'll
+					use a decision tree after all
+					and fall through*/
+      return(index);
+  }
+
   if(nt){
     /* optimized using the decision tree */
     while(1){
@@ -342,23 +363,6 @@ int _best(codebook *book, double *a, int step){
     return(-ptr);
   }
 
-  /* we assume for now that a thresh tree is the only other possibility */
-  if(tt){
-    int index=0;
-    /* find the quant val of each scalar */
-    for(k=0,o=step*(dim-1);k<dim;k++,o-=step){
-      int i;
-      /* linear search the quant list for now; it's small and although
-	 with > 8 entries, it would be faster to bisect, this would be
-	 a misplaced optimization for now */
-      for(i=0;i<tt->quantvals-1;i++)
-	if(a[o]<tt->quantthresh[i])break;
-
-      index=(index*tt->quantvals)+tt->quantmap[i];
-    }
-    /* regular lattices are easy :-) */
-    return(index);
-  }
   return(-1);
 }
 
diff --git a/vq/Makefile.in b/vq/Makefile.in
index 34a20603..02a1199e 100644
--- a/vq/Makefile.in
+++ b/vq/Makefile.in
@@ -1,4 +1,4 @@
-# $Id: Makefile.in,v 1.10.4.2 2000/04/21 16:35:40 xiphmont Exp $
+# $Id: Makefile.in,v 1.10.4.3 2000/04/26 07:10:15 xiphmont Exp $
 
 ###############################################################################
 #                                                                             #
@@ -29,7 +29,8 @@ HFILES =	../include/vorbis/codebook.h vqgen.h vqext.h bookutil.h
 
 OFILES = 	vqgen.o vqsplit.o bookutil.o ../lib/sharedbook.o
 ALLOFILES = 	$(OFILES) lspdata.o genericdata.o train.o build.o run.o\
-		cascade.o partition.o metrics.o residuedata.o latticebuild.o
+		cascade.o partition.o metrics.o residuedata.o latticebuild.o\
+		latticepare.o
 
 all:
 	$(MAKE) target CFLAGS="$(OPT)"
@@ -40,7 +41,7 @@ debug:
 profile: 
 	$(MAKE) target CFLAGS="$(PROFILE)"
 
-target:	lspvqtrain genericvqtrain residuevqtrain vqbuild vqcascade vqmetrics latticebuild
+target:	lspvqtrain genericvqtrain residuevqtrain vqbuild vqcascade vqmetrics latticebuild latticepare
 
 lspvqtrain:	$(OFILES) lspdata.o train.o
 	$(CC) $(CFLAGS) $(LDFLAGS) $^ -o $@ $(LIBS)
@@ -63,6 +64,9 @@ vqmetrics:	$(OFILES) run.o metrics.o
 latticebuild:	$(OFILES) latticebuild.o
 	$(CC) $(CFLAGS) $(LDFLAGS) $^ -o $@ $(LIBS)
 
+latticepare:	$(OFILES) latticepare.o
+	$(CC) $(CFLAGS) $(LDFLAGS) $^ -o $@ $(LIBS)
+
 
 $(ALLOFILES): 	$(HFILES)
 
diff --git a/vq/bookutil.c b/vq/bookutil.c
index a1874859..e479f7fe 100644
--- a/vq/bookutil.c
+++ b/vq/bookutil.c
@@ -12,7 +12,7 @@
  ********************************************************************
 
  function: utility functions for loading .vqh and .vqd files
- last mod: $Id: bookutil.c,v 1.12.4.3 2000/04/21 16:35:40 xiphmont Exp $
+ last mod: $Id: bookutil.c,v 1.12.4.4 2000/04/26 07:10:15 xiphmont Exp $
 
  ********************************************************************/
 
@@ -187,9 +187,6 @@ codebook *codebook_load(char *filename){
   char *line;
   long i;
 
-  c->nearest_tree=a;
-  c->thresh_tree=t;
-
   if(in==NULL){
     fprintf(stderr,"Couldn't open codebook %s\n",filename);
     exit(1);
@@ -217,7 +214,7 @@ codebook *codebook_load(char *filename){
   /* find the auxiliary encode struct[s] (if any) */
   if(find_seek_to(in,"static encode_aux_nearestmatch _vq_aux_")){
     /* how big? */
-    a=calloc(1,sizeof(encode_aux_nearestmatch));
+    c->nearest_tree=a=calloc(1,sizeof(encode_aux_nearestmatch));
     line=get_line(in);
     line=get_line(in);
     line=get_line(in);
@@ -272,29 +269,35 @@ codebook *codebook_load(char *filename){
   
   if(find_seek_to(in,"static encode_aux_threshmatch _vq_aux_")){
     /* how big? */
-    t=calloc(1,sizeof(encode_aux_threshmatch));
+    c->thresh_tree=t=calloc(1,sizeof(encode_aux_threshmatch));
+    line=get_line(in);
     line=get_line(in);
     line=get_line(in);
     if(sscanf(line,"%d",&(t->quantvals))!=1){
-      fprintf(stderr,"2: syntax in %s in line:\t %s",filename,line);
+      fprintf(stderr,"3: syntax in %s in line:\t %s",filename,line);
+      exit(1);
+    }
+    line=get_line(in);
+    if(sscanf(line,"%d",&(t->threshvals))!=1){
+      fprintf(stderr,"4: syntax in %s in line:\t %s",filename,line);
       exit(1);
     }
     /* load quantthresh */
-    find_seek_to(in,"static long _vq_quantthresh_");
+    find_seek_to(in,"static double _vq_quantthresh_");
     reset_next_value();
-    t->quantthresh=malloc(sizeof(long)*t->quantvals);
-    for(i=0;i<t->quantvals;i++)
+    t->quantthresh=malloc(sizeof(double)*t->threshvals);
+    for(i=0;i<t->threshvals-1;i++)
       if(get_next_value(in,t->quantthresh+i)){
-	fprintf(stderr,"out of data while reading codebook %s\n",filename);
+	fprintf(stderr,"out of data 1 while reading codebook %s\n",filename);
 	exit(1);
       }    
     /* load quantmap */
     find_seek_to(in,"static long _vq_quantmap_");
     reset_next_value();
-    t->quantmap=malloc(sizeof(long)*t->quantvals);
-    for(i=0;i<t->quantvals;i++)
+    t->quantmap=malloc(sizeof(long)*t->threshvals);
+    for(i=0;i<t->threshvals;i++)
       if(get_next_ivalue(in,t->quantmap+i)){
-	fprintf(stderr,"out of data while reading codebook %s\n",filename);
+	fprintf(stderr,"out of data 2 while reading codebook %s\n",filename);
 	exit(1);
       }    
   }
diff --git a/vq/latticebuild.c b/vq/latticebuild.c
index 65aca502..9b0a22fa 100644
--- a/vq/latticebuild.c
+++ b/vq/latticebuild.c
@@ -12,7 +12,7 @@
  ********************************************************************
 
  function: utility main for building codebooks from lattice descriptions
- last mod: $Id: latticebuild.c,v 1.1.2.1 2000/04/21 16:35:40 xiphmont Exp $
+ last mod: $Id: latticebuild.c,v 1.1.2.2 2000/04/26 07:10:16 xiphmont Exp $
 
  ********************************************************************/
 
@@ -31,12 +31,13 @@
    command line:
    vqlattice description.vql datafile.vqd 
 
-   the lattice description file contains four lines:
+   the lattice description file contains five lines:
 
    <n> <dim> <multiplicitavep>
    <value_0> <value_1> <value_2> ... <value_n-1>
-   <thresh_0> <thresh_1> <thresh_2> ... <thresh_n-2>
-   <map_0> <map_1> <map_2> ... <map_n-1>
+   <m>
+   <thresh_0> <thresh_1> <thresh_2> ... <thresh_m-2>
+   <map_0> <map_1> <map_2> ... <map_m-1>
 
    vqlattice sends residual data (for the next stage) to stdout, and
    produces description.vqh */
@@ -48,7 +49,7 @@ int main(int argc,char *argv[]){
   double *quantlist;
   long *hits;
 
-  int entries=-1,dim=-1,quantvals=-1,addmul=-1;
+  int entries=-1,dim=-1,quantvals=-1,addmul=-1,threshvals=-1;
   FILE *out=NULL;
   FILE *in=NULL;
   char *line,*name;
@@ -105,9 +106,6 @@ int main(int argc,char *argv[]){
   c.maptype=1;
   c.q_sequencep=0;
   c.quantlist=calloc(quantvals,sizeof(long));
-  t.quantthresh=calloc(quantvals-1,sizeof(double));
-  t.quantmap=calloc(quantvals,sizeof(int));
-  t.quantvals=quantvals;
 
   quantlist=malloc(sizeof(long)*c.dim*c.entries);
   hits=malloc(c.entries*sizeof(long));
@@ -122,18 +120,29 @@ int main(int argc,char *argv[]){
     }
   }
 
+  line=setup_line(in);
+  if(sscanf(line,"%d",&threshvals)!=1){
+    fprintf(stderr,"Syntax error reading book file (line 3)\n");
+    exit(1);
+  }
+  
+  t.quantthresh=calloc(threshvals-1,sizeof(double));
+  t.quantmap=calloc(threshvals,sizeof(int));
+  t.threshvals=threshvals;
+  t.quantvals=quantvals;
+
   setup_line(in);
-  for(j=0;j<quantvals-1;j++){  
+  for(j=0;j<threshvals-1;j++){  
     if(get_line_value(in,t.quantthresh+j)==-1){
-      fprintf(stderr,"Ran out of data on line 3 of description file\n");
+      fprintf(stderr,"Ran out of data on line 4 of description file\n");
       exit(1);
     }
   }
 
   setup_line(in);
-  for(j=0;j<quantvals;j++){  
+  for(j=0;j<threshvals;j++){  
     if(get_next_ivalue(in,t.quantmap+j)==-1){
-      fprintf(stderr,"Ran out of data on line 4 of description file\n");
+      fprintf(stderr,"Ran out of data on line 5 of description file\n");
       exit(1);
     }
   }
@@ -141,12 +150,14 @@ int main(int argc,char *argv[]){
   /* gen a real quant list from the more easily human-grokked input */
   {
     double min=quantlist[0];
-    double mindel=fabs(quantlist[0]-quantlist[1]);
+    double mindel=1;
     for(j=1;j<quantvals;j++){  
       if(quantlist[j]<min)min=quantlist[j];
       for(k=0;k<j;k++){
-	double del=fabs(quantlist[j]-quantlist[k]);
-	if(del<mindel)mindel=del;
+	double del=quantlist[k]-min;
+	/* really underpowered :-P know that this will only factor
+           powers of two (duh) */
+	while((int)(del/mindel)+.01<del/mindel){mindel/=2;}
       }
     }
     c.q_min=_float32_pack(min);
@@ -266,19 +277,19 @@ int main(int argc,char *argv[]){
 
   /* quantthresh */
   fprintf(out,"static double _vq_quantthresh_%s[] = {\n",name);
-  for(j=0;j<t.quantvals-1;){
+  for(j=0;j<t.threshvals-1;){
     fprintf(out,"\t");
-    for(k=0;k<8 && j<t.quantvals-1;k++,j++)
-      fprintf(out,"%05g,",t.quantthresh[j]);
+    for(k=0;k<8 && j<t.threshvals-1;k++,j++)
+      fprintf(out,"%.5g, ",t.quantthresh[j]);
     fprintf(out,"\n");
   }
   fprintf(out,"};\n\n");
 
   /* quantmap */
   fprintf(out,"static long _vq_quantmap_%s[] = {\n",name);
-  for(j=0;j<t.quantvals;){
+  for(j=0;j<t.threshvals;){
     fprintf(out,"\t");
-    for(k=0;k<8 && j<t.quantvals;k++,j++)
+    for(k=0;k<8 && j<t.threshvals;k++,j++)
       fprintf(out,"%5ld,",t.quantmap[j]);
     fprintf(out,"\n");
   }
@@ -289,13 +300,14 @@ int main(int argc,char *argv[]){
   fprintf(out,"static encode_aux_threshmatch _vq_aux_%s = {\n",name);
   fprintf(out,"\t_vq_quantthresh_%s,\n",name);
   fprintf(out,"\t_vq_quantmap_%s,\n",name);
-  fprintf(out,"\t%d\n};\n\n",t.quantvals);
+  fprintf(out,"\t%d,\n",t.quantvals);
+  fprintf(out,"\t%d\n};\n\n",t.threshvals);
   
   fprintf(out,"static static_codebook _vq_book_%s = {\n",name);
   
   fprintf(out,"\t%ld, %ld,\n",c.dim,c.entries);
   fprintf(out,"\t_vq_lengthlist_%s,\n",name);
-  fprintf(out,"\t2, %ld, %ld, %d, %d,\n",
+  fprintf(out,"\t1, %ld, %ld, %d, %d,\n",
 	  c.q_min,c.q_delta,c.q_quant,c.q_sequencep);
   fprintf(out,"\t_vq_quantlist_%s,\n",name);
   fprintf(out,"\tNULL,\n");
diff --git a/vq/latticepare.c b/vq/latticepare.c
new file mode 100644
index 00000000..2b9b5086
--- /dev/null
+++ b/vq/latticepare.c
@@ -0,0 +1,392 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE Ogg Vorbis SOFTWARE CODEC SOURCE CODE.  *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS SOURCE IS GOVERNED BY *
+ * THE GNU PUBLIC LICENSE 2, WHICH IS INCLUDED WITH THIS SOURCE.    *
+ * PLEASE READ THESE TERMS DISTRIBUTING.                            *
+ *                                                                  *
+ * THE OggSQUISH SOURCE CODE IS (C) COPYRIGHT 1994-2000             *
+ * by Monty <monty@xiph.org> and The XIPHOPHORUS Company            *
+ * http://www.xiph.org/                                             *
+ *                                                                  *
+ ********************************************************************
+
+ function: utility for paring low hit count cells from lattice codebook
+ last mod: $Id: latticepare.c,v 1.1.2.1 2000/04/26 07:10:16 xiphmont Exp $
+
+ ********************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <string.h>
+#include <errno.h>
+#include "vorbis/codebook.h"
+#include "../lib/sharedbook.h"
+#include "bookutil.h"
+
+/* Lattice codebooks have two strengths: important fetaures that are
+   poorly modelled by global error minimization training (eg, strong
+   peaks) are not neglected 2) compact quantized representation.
+
+   A fully populated lattice codebook, however, swings point 1 too far
+   in the opposite direction; rare features need not be modelled quite
+   so religiously and as such, we waste bits unless we eliminate the
+   least common cells.  The codebook rep supports unused cells, so we
+   need to tag such cells and build an auxiliary (non-thresh) search
+   mechanism to find the proper match quickly */
+
+/* two basic steps; first is pare the cell for which dispersal creates
+   the least additional error.  This will naturally choose
+   low-population cells and cells that have not taken on points from
+   neighboring paring (but does not result in the lattice collapsing
+   inward and leaving low population ares totally unmodelled).  After
+   paring has removed the desired number of cells, we need to build an
+   auxiliary search for each culled point */
+
+/* Although lattice books (due to threshhold-based matching) do not
+   actually use error to make cell selections (in fact, it need not
+   bear any relation), the 'secondbest' entry finder here is in fact
+   error/distance based, so latticepare is only useful on such books */
+
+/* command line:
+   latticepare latticebook.vqh input_data.vqd <target_cells>
+
+   produces a new output book on stdout 
+*/
+
+void usage(){
+  fprintf(stderr,"latticepare latticebook.vqh input_data.vqd <target_cells>\n"
+	  "produces a new output book on stdout \n");
+  exit(1);
+}
+
+static double _dist(int el,double *a, double *b){
+  int i;
+  double acc=0.;
+  for(i=0;i<el;i++){
+    double val=(a[i]-b[i]);
+    acc+=val*val;
+  }
+  return(acc);
+}
+
+static double *pointlist;
+static long points=0;
+static long allocated=0;
+
+void add_vector(codebook *b,double *vec,long n){
+  int dim=b->dim,i,j;
+  for(i=0;i<n/dim;i++){
+    for(j=i;j<n;j+=dim){
+      if(points>=allocated){
+	if(allocated){
+	  allocated*=2;
+	  pointlist=realloc(pointlist,allocated*dim*sizeof(double));
+	}else{
+	  allocated=1024*1024;
+	  pointlist=malloc(allocated*dim*sizeof(double));
+	}
+      }
+
+      pointlist[points++]=vec[j];
+      if(!(points&0xff))spinnit("loading... ",points);
+    }
+  }
+}
+
+/* search neighboring [non-culled] cells for lowest distance.  Spiral
+   out in the event culling is deep */
+static int secondbest(codebook *b,double *vec,int best){
+  encode_aux_threshmatch *tt=b->c->thresh_tree;
+  int dim=b->dim;
+  int i,j,spiral=1;
+  int *index=alloca(dim*sizeof(int)*2);
+  int *mod=index+dim;
+  int entry=best;
+
+  double bestmetric=0;
+  int bestentry=-1;
+
+  /* decompose index */
+  for(i=0;i<dim;i++){
+    index[i]=best%tt->quantvals;
+    best/=tt->quantvals;
+  }
+
+  /* hit one off on all sides of it; most likely we'll find a possible
+     match */
+  for(i=0;i<dim;i++){
+    /* one up */
+    if(index[i]+1<tt->quantvals){
+      int newentry=entry+rint(pow(tt->quantvals,i));
+      if(b->c->lengthlist[newentry]>0){
+	double thismetric=_dist(dim, vec, b->valuelist+newentry*dim);
+
+	if(bestentry==-1 || bestmetric>thismetric){
+	  bestmetric=thismetric;
+	  bestentry=newentry;
+	}
+      }
+    }
+      
+    /* one down */
+    if(index[i]-1>=0){
+      int newentry=entry-rint(pow(tt->quantvals,i));
+      if(b->c->lengthlist[newentry]>0){
+	double thismetric=_dist(dim, vec, b->valuelist+newentry*dim);
+
+	if(bestentry==-1 || bestmetric>thismetric){
+	  bestmetric=thismetric;
+	  bestentry=newentry;
+	}
+      }
+    }
+  }
+      
+  /* no match?  search all cells, binary count, that are one away on
+     one or more axes.  Then continue out until there's a match.
+     We'll find one eventually, it's relatively OK to be inefficient
+     as the attempt above will almost always succeed */
+  while(bestentry==-1){
+    for(i=0;i<dim;i++)mod[i]= -spiral;
+    while(1){
+      int newentry=entry;
+
+      /* update the mod */
+      for(j=0;j<dim;j++){
+	if(mod[j]<=spiral)
+	  break;
+	else{
+	  if(j+1<dim){
+	    mod[j]= -spiral;
+	    mod[j+1]++;
+	  }
+	}
+      }
+      if(j==dim)break;
+
+      /* reconstitute the entry */
+      for(j=0;j<dim;j++){
+	if(index[j]+mod[j]<0 || index[j]+mod[j]>=tt->quantvals){
+	  newentry=-1;
+	  break;
+	}
+	newentry+=mod[j]*rint(pow(tt->quantvals,j));
+      }
+
+      if(newentry!=-1 && newentry!=entry)
+	if(b->c->lengthlist[newentry]>0){
+	  double thismetric=_dist(dim, vec, b->valuelist+newentry*dim);
+
+	  if(bestentry==-1 || bestmetric>thismetric){
+	    bestmetric=thismetric;
+	    bestentry=newentry;
+	  }
+	}
+      mod[0]++;
+    }
+    spiral++;
+  }
+
+  return(bestentry);
+}
+
+
+int main(int argc,char *argv[]){
+  char *basename;
+  codebook *b=NULL;
+  int input=0;
+  int entries;
+  int dim;
+  long i,j,target=-1;
+  argv++;
+
+  if(*argv==NULL){
+    usage();
+    exit(1);
+  }
+
+  /* yes, this is evil.  However, it's very convenient to parse file
+     extentions */
+
+  while(*argv){
+    if(*argv[0]=='-'){
+      /* option */
+    }else{
+      /* input file.  What kind? */
+      char *dot;
+      char *ext=NULL;
+      char *name=strdup(*argv++);
+      dot=strrchr(name,'.');
+      if(dot)
+        ext=dot+1;
+      else{
+	ext="";
+	target=atol(name);
+      }
+      
+
+      /* codebook */
+      if(!strcmp(ext,"vqh")){
+        if(input){
+          fprintf(stderr,"specify all input data (.vqd) files following\n"
+                  "codebook header (.vqh) files\n");
+          exit(1);
+        }
+
+        basename=strrchr(name,'/');
+        if(basename)
+          basename=strdup(basename)+1;
+        else
+          basename=strdup(name);
+        dot=strrchr(basename,'.');
+        if(dot)*dot='\0';
+
+        b=codebook_load(name);
+      }
+
+      /* data file; we do actually need to suck it into memory */
+      /* we're dealing with just one book, so we can de-interleave */ 
+      if(!strcmp(ext,"vqd")){
+        int cols;
+        long lines=0;
+        char *line;
+        double *vec;
+        FILE *in=fopen(name,"r");
+        if(!in){
+          fprintf(stderr,"Could not open input file %s\n",name);
+          exit(1);
+        }
+
+        reset_next_value();
+        line=setup_line(in);
+        /* count cols before we start reading */
+        {
+          char *temp=line;
+          while(*temp==' ')temp++;
+          for(cols=0;*temp;cols++){
+            while(*temp>32)temp++;
+            while(*temp==' ')temp++;
+          }
+        }
+        vec=alloca(cols*sizeof(double));
+        while(line){
+          lines++;
+          for(j=0;j<cols;j++)
+            if(get_line_value(in,vec+j)){
+              fprintf(stderr,"Too few columns on line %ld in data file\n",lines);
+              exit(1);
+            }
+	  /* deinterleave, add to heap */
+	  add_vector(b,vec,cols);
+
+          line=setup_line(in);
+        }
+        fclose(in);
+      }
+    }
+  }
+  dim=b->dim;
+  entries=b->entries;
+
+  if(target==-1){
+    fprintf(stderr,"Target number of cells required on command line\n");
+    exit(1);
+  }
+
+  /* set up auxiliary vectors for error tracking */
+  {
+    long *membership=malloc(points*sizeof(long));
+    long *cellhead=malloc(entries*sizeof(long));
+    double *cellerror1=calloc(entries,sizeof(double)); /* error for
+                                                          firstentries */
+    double *cellerror2=calloc(entries,sizeof(double)); /* error for
+                                                          secondentry */
+    double globalerror=0.;
+    long cellsleft=entries;
+    for(i=0;i<points;i++)membership[i]=-1;
+    for(i=0;i<entries;i++)cellhead[i]=-1;
+
+    for(i=0;i<points;i++){
+      /* assign vectors to the nearest cell.  Also keep track of second
+	 nearest for error statistics */
+      double *ppt=pointlist+i*dim;
+      int    firstentry=_best(b,ppt,1);
+      int    secondentry=secondbest(b,ppt,firstentry);
+
+      double firstmetric=_dist(dim,b->valuelist+dim*firstentry,ppt);
+      double secondmetric=_dist(dim,b->valuelist+dim*secondentry,ppt);
+      
+      if(!(i&0xff))spinnit("initializing... ",points-i);
+    
+      membership[i]=cellhead[firstentry];
+      cellhead[firstentry]=i;
+
+      cellerror1[firstentry]+=firstmetric;
+      globalerror+=firstmetric;
+      cellerror2[firstentry]+=secondmetric;
+
+    }
+
+    while(cellsleft>target){
+      int bestcell=-1;
+      double besterror=0;
+      long head=-1;
+      spinnit("cells left to eliminate... ",cellsleft-target);
+
+      /* find the cell with lowest removal impact */
+      for(i=0;i<entries;i++){
+	if(b->c->lengthlist[i]>0){
+	  double thiserror=globalerror-cellerror1[i]+cellerror2[i];
+	  if(bestcell==-1 || besterror>thiserror){
+	    besterror=thiserror;
+	    bestcell=i;
+	  }
+	}
+      }
+
+      /* disperse it.  move each point out, adding it (properly) to
+         the second best */
+      b->c->lengthlist[bestcell]=0;
+      head=cellhead[bestcell];
+      cellhead[bestcell]=-1;
+      while(head!=-1){
+	/* head is a point number */
+	double *ppt=pointlist+head*dim;
+	int newentry=secondbest(b,ppt,bestcell);
+	int secondentry=secondbest(b,pointlist+head*dim,newentry);
+	double firstmetric=_dist(dim,b->valuelist+dim*newentry,ppt);
+	double secondmetric=_dist(dim,b->valuelist+dim*secondentry,ppt);
+	long next=membership[head];
+	cellerror1[newentry]+=firstmetric;
+	cellerror2[newentry]+=secondmetric;
+
+	membership[head]=cellhead[newentry];
+	cellhead[newentry]=head;
+	head=next;
+      }
+
+      cellsleft--;
+    }
+    free(membership);
+    free(cellhead);
+    free(cellerror1);
+    free(cellerror2);
+  }
+
+  for(i=0;i<entries;i++)
+    fprintf(stderr,"%ld, ",b->c->lengthlist[i]);
+
+  /* paring is over.  Build decision trees using points that now fall
+     through the thresh matcher */
+
+
+
+  fprintf(stderr,"\r                                        \nDone.");
+  return(0);
+}
+
+
+
+
diff --git a/vq/metrics.c b/vq/metrics.c
index 90e19ea9..201521f4 100644
--- a/vq/metrics.c
+++ b/vq/metrics.c
@@ -12,7 +12,7 @@
  ********************************************************************
 
  function: function calls to collect codebook metrics
- last mod: $Id: metrics.c,v 1.6.4.2 2000/04/21 16:35:40 xiphmont Exp $
+ last mod: $Id: metrics.c,v 1.6.4.3 2000/04/26 07:10:16 xiphmont Exp $
 
  ********************************************************************/
 
@@ -24,57 +24,63 @@
 #include "../lib/sharedbook.h"
 #include "bookutil.h"
 
+/* collect the following metrics:
+
+   mean and mean squared amplitude
+   mean and mean squared error 
+   mean and mean squared error (per sample) by entry
+   worst case fit by entry
+   entry cell size
+   hits by entry
+   total bits
+   total samples
+   (average bits per sample)*/
+   
+
 /* set up metrics */
 
 double meanamplitude_acc=0.;
 double meanamplitudesq_acc=0.;
 double meanerror_acc=0.;
 double meanerrorsq_acc=0.;
-double meandev_acc=0.;
-
-double *histogram=NULL;
-double *histogram_error=NULL;
-double *histogram_errorsq=NULL;
-double *histogram_distance=NULL;
-double *histogram_hi=NULL;
-double *histogram_lo=NULL;
 
+double **histogram=NULL;
+double **histogram_error=NULL;
+double **histogram_errorsq=NULL;
+double **histogram_hi=NULL;
+double **histogram_lo=NULL;
+double bits=0.;
 double count=0.;
 
-int books=0;
-int lastdim=-1;
-
 static double *_now(codebook *c, int i){
   return c->valuelist+i*c->c->dim;
 }
 
-int histerrsort(const void *a, const void *b){
-  double av=histogram_distance[*((long *)a)];
-  double bv=histogram_distance[*((long *)b)];
-  if(av<bv)return(-1);
-  return(1);
-}
+int books=0;
 
 void process_preprocess(codebook **bs,char *basename){
-  while(bs[books]){
-    codebook *b=bs[books];
-    lastdim=b->c->dim;
-    books++;
-  }
-
-
+  int i;
+  while(bs[books])books++;
+  
   if(books){
-    const static_codebook *b=bs[books-1]->c;
-    histogram=calloc(b->entries,sizeof(double));
-    histogram_distance=calloc(b->entries,sizeof(double));
-    histogram_errorsq=calloc(b->entries*lastdim,sizeof(double));
-    histogram_error=calloc(b->entries*lastdim,sizeof(double));
-    histogram_hi=calloc(b->entries*lastdim,sizeof(double));
-    histogram_lo=calloc(b->entries*lastdim,sizeof(double));
+    histogram=calloc(books,sizeof(double *));
+    histogram_error=calloc(books,sizeof(double *));
+    histogram_errorsq=calloc(books,sizeof(double *));
+    histogram_hi=calloc(books,sizeof(double *));
+    histogram_lo=calloc(books,sizeof(double *));
   }else{
     fprintf(stderr,"Specify at least one codebook\n");
     exit(1);
   }
+
+  for(i=0;i<books;i++){
+    codebook *b=bs[i];
+    histogram[i]=calloc(b->entries,sizeof(double));
+    histogram_error[i]=calloc(b->entries*b->dim,sizeof(double));
+    histogram_errorsq[i]=calloc(b->entries*b->dim,sizeof(double));
+    histogram_hi[i]=calloc(b->entries*b->dim,sizeof(double));
+    histogram_lo[i]=calloc(b->entries*b->dim,sizeof(double));
+  }
 }
 
 static double _dist(int el,double *a, double *b){
@@ -87,219 +93,181 @@ static double _dist(int el,double *a, double *b){
   return acc;
 }
 
-void cell_spacing(codebook **b){
-  int i,j,k;
-  for(i=0;i<books;i++){
-    double min,max,mean=0.,meansq=0.;
-    codebook *c=b[i];
-
-    fprintf(stderr,"\nCell spacing for book %d:\n",i);
-    
-    /* minimum, maximum, mean, ms cell spacing */
-    for(j=0;j<c->c->entries;j++){
-      double localmin=-1.;
-      for(k=0;k<c->c->entries;k++){
-	double this=_dist(c->c->dim,_now(c,j),_now(c,k));
-	if(j!=k &&
-	   (localmin==-1 || this<localmin))
-	  localmin=this;
-      }
-      
-      if(j==0 || localmin<min)min=localmin;
-      if(j==0 || localmin>max)max=localmin;
-      mean+=sqrt(localmin);
-      meansq+=localmin;
+void cell_spacing(codebook *c){
+  int j,k;
+  double min,max,mean=0.,meansq=0.;
+  
+  /* minimum, maximum, mean, ms cell spacing */
+  for(j=0;j<c->c->entries;j++){
+    double localmin=-1.;
+    for(k=0;k<c->c->entries;k++){
+      double this=_dist(c->c->dim,_now(c,j),_now(c,k));
+      if(j!=k &&
+	 (localmin==-1 || this<localmin))
+	localmin=this;
     }
-
-    fprintf(stderr,"\tminimum cell spacing (closest side): %g\n",sqrt(min));
-    fprintf(stderr,"\tmaximum cell spacing (closest side): %g\n",sqrt(max));
-    fprintf(stderr,"\tmean closest side spacing: %g\n",mean/c->c->entries);
-    fprintf(stderr,"\tmean sq closest side spacing: %g\n",sqrt(meansq/c->c->entries));
+    
+    if(j==0 || localmin<min)min=localmin;
+    if(j==0 || localmin>max)max=localmin;
+    mean+=sqrt(localmin);
+    meansq+=localmin;
   }
+  
+  fprintf(stderr,"\tminimum cell spacing (closest side): %g\n",sqrt(min));
+  fprintf(stderr,"\tmaximum cell spacing (closest side): %g\n",sqrt(max));
+  fprintf(stderr,"\tmean closest side spacing: %g\n",mean/c->c->entries);
+  fprintf(stderr,"\tmean sq closest side spacing: %g\n",sqrt(meansq/c->c->entries));
 }
 
-void process_postprocess(codebook **b,char *basename){
-  int i,j,k;
+void process_postprocess(codebook **bs,char *basename){
+  int i,k,book;
   char *buffer=alloca(strlen(basename)+80);
-  codebook *bb=b[books-1];
 
-  fprintf(stderr,"Done.  Processed %ld data points:\n",
+  fprintf(stderr,"Done.  Processed %ld data points:\n\n",
 	  (long)count);
-  fprintf(stderr,"\tglobal mean amplitude: %g\n",
-	  meanamplitude_acc/(count*lastdim));
-  fprintf(stderr,"\tglobal mean squared amplitude: %g\n",
-	  sqrt(meanamplitudesq_acc/(count*lastdim)));
 
-  fprintf(stderr,"\tglobal mean error: %g\n",
-	  meanerror_acc/(count*lastdim));
-  fprintf(stderr,"\tglobal mean squared error: %g\n",
-	  sqrt(meanerrorsq_acc/(count*lastdim)));
-  fprintf(stderr,"\tglobal mean deviation: %g\n",
-	  meandev_acc/(count*lastdim));
+  fprintf(stderr,"Global statistics:******************\n\n");
+
+  fprintf(stderr,"\ttotal samples: %ld\n",(long)count);
+  fprintf(stderr,"\ttotal bits required to code: %ld\n",(long)bits);
+  fprintf(stderr,"\taverage bits per sample: %g\n\n",bits/count);
 
-  cell_spacing(b);
+  fprintf(stderr,"\tmean sample amplitude: %g\n",
+	  meanamplitude_acc/count);
+  fprintf(stderr,"\tmean squared sample amplitude: %g\n\n",
+	  sqrt(meanamplitudesq_acc/count));
 
-  {
+  fprintf(stderr,"\tmean code error: %g\n",
+	  meanerror_acc/count);
+  fprintf(stderr,"\tmean squared code error: %g\n\n",
+	  sqrt(meanerrorsq_acc/count));
+
+  for(book=0;book<books;book++){
     FILE *out;
+    codebook *b=bs[book];
+    int n=b->c->entries;
+    int dim=b->c->dim;
+
+    fprintf(stderr,"Book %d statistics:------------------\n",book);
 
-    sprintf(buffer,"%s-mse.m",basename);
+    cell_spacing(b);
+
+    sprintf(buffer,"%s-%d-mse.m",basename,book);
     out=fopen(buffer,"w");
     if(!out){
       fprintf(stderr,"Could not open file %s for writing\n",buffer);
       exit(1);
     }
-
-    for(i=0;i<bb->c->entries;i++){
-      for(k=0;k<bb->c->dim;k++){
-	fprintf(out,"%ld, %g, %g\n",
-		i*bb->c->dim+k,(bb->valuelist+i*bb->c->dim)[k],
-		sqrt((histogram_errorsq+i*bb->c->dim)[k]/histogram[i]));
+    
+    for(i=0;i<n;i++){
+      for(k=0;k<dim;k++){
+	fprintf(out,"%d, %g, %g\n",
+		i*dim+k,(b->valuelist+i*dim)[k],
+		sqrt((histogram_errorsq[book]+i*dim)[k]/histogram[book][i]));
       }
     }
     fclose(out);
-
-    sprintf(buffer,"%s-me.m",basename);
+      
+    sprintf(buffer,"%s-%d-me.m",basename,book);
     out=fopen(buffer,"w");
     if(!out){
       fprintf(stderr,"Could not open file %s for writing\n",buffer);
       exit(1);
     }
-
-    for(i=0;i<bb->c->entries;i++){
-      for(k=0;k<bb->c->dim;k++){
-	fprintf(out,"%ld, %g, %g\n",
-		i*bb->c->dim+k,(bb->valuelist+i*bb->c->dim)[k],
-		(histogram_error+i*bb->c->dim)[k]/histogram[i]);
+    
+    for(i=0;i<n;i++){
+      for(k=0;k<dim;k++){
+	fprintf(out,"%d, %g, %g\n",
+		i*dim+k,(b->valuelist+i*dim)[k],
+		(histogram_error[book]+i*dim)[k]/histogram[book][i]);
       }
     }
     fclose(out);
 
-    sprintf(buffer,"%s-worst.m",basename);
+    sprintf(buffer,"%s-%d-worst.m",basename,book);
     out=fopen(buffer,"w");
     if(!out){
       fprintf(stderr,"Could not open file %s for writing\n",buffer);
       exit(1);
     }
-
-    for(i=0;i<bb->c->entries;i++){
-      for(k=0;k<bb->c->dim;k++){
-	fprintf(out,"%ld, %g, %g, %g\n",
-		i*bb->c->dim+k,(bb->valuelist+i*bb->c->dim)[k],
-		(bb->valuelist+i*bb->c->dim)[k]+(histogram_lo+i*bb->c->dim)[k],
-		(bb->valuelist+i*bb->c->dim)[k]+(histogram_hi+i*bb->c->dim)[k]);
+    
+    for(i=0;i<n;i++){
+      for(k=0;k<dim;k++){
+	fprintf(out,"%d, %g, %g, %g\n",
+		i*dim+k,(b->valuelist+i*dim)[k],
+		(b->valuelist+i*dim)[k]+(histogram_lo[book]+i*dim)[k],
+		(b->valuelist+i*dim)[k]+(histogram_hi[book]+i*dim)[k]);
       }
     }
     fclose(out);
   }
+}
 
-  {
-    FILE *out;
-    long *index=alloca(sizeof(long)*bb->c->entries);
-    sprintf(buffer,"%s-distance.m",basename);
-    out=fopen(buffer,"w");
-    if(!out){
-      fprintf(stderr,"Could not open file %s for writing\n",buffer);
-      exit(1);
-    }
-    for(j=0;j<bb->c->entries;j++){
-      if(histogram[j])histogram_distance[j]/=histogram[j];
-      index[j]=j;
+void process_one(codebook *b,int book,double *a,int dim,int step,int addmul){
+  int j,entry;
+  double base=0.;
+  double amplitude=0.;
+
+  if(book==0)
+    for(j=0;j<dim;j++){
+      if(b->c->q_sequencep){
+	amplitude=a[j*step]-base;
+	base=a[j*step];
+      }else
+	amplitude=a[j*step];
+      meanamplitude_acc+=fabs(amplitude);
+      meanamplitudesq_acc+=amplitude*amplitude;
+      count++;
     }
 
-    qsort(index,bb->c->entries,sizeof(long),histerrsort);
+  entry=vorbis_book_besterror(b,a,step,addmul);
+  
+  histogram[book][entry]++;  
+  bits+=vorbis_book_codelen(b,entry);
+	  
+  for(j=0;j<dim;j++){
+    double error=a[j*step];
 
-    for(j=0;j<bb->c->entries;j++)
-      for(k=0;k<histogram[index[j]];k++)
-	fprintf(out,"%g,\n",histogram_distance[index[j]]);
-    fclose(out);
-		
+    if(book==books-1){
+      meanerror_acc+=fabs(error);
+      meanerrorsq_acc+=error*error;
+    }
+    histogram_errorsq[book][entry*dim+j]+=error*error;
+    histogram_error[book][entry*dim+j]+=fabs(error);
+    if(histogram[book][entry]==0 || histogram_hi[book][entry*dim+j]<error)
+      histogram_hi[book][entry*dim+j]=error;
+    if(histogram[book][entry]==0 || histogram_lo[book][entry*dim+j]>error)
+      histogram_lo[book][entry*dim+j]=error;
   }
 }
 
+
 void process_vector(codebook **bs,int *addmul,int inter,double *a,int n){
   int bi;
-  int i,j;
-  double base=0.;
-  double *work=alloca(sizeof(double)*n);
-  double amplitude=0.;
-
-  memcpy(work,a,sizeof(double)*n);
+  int i;
 
   for(bi=0;bi<books;bi++){
     codebook *b=bs[bi];
     int dim=b->dim;
-
+    
     if(inter){
-      for(i=0;i<n/dim;i++){
-	int entry=vorbis_book_besterror(b,work+i,n/dim,addmul[bi]);
-	
-	if(bi==books-1){
-	  double distance=0;
-	  histogram[entry]++;  
-	  
-	  for(j=0;j<dim;j++){
-	    double error=work[i+j*(n/dim)];
-	    histogram_errorsq[entry*dim+j]+=error*error;
-	    histogram_error[entry*dim+j]+=fabs(error);
-	    if(histogram[entry]==0 || histogram_hi[entry*dim+j]<error)
-	      histogram_hi[entry*dim+j]=error;
-	    if(histogram[entry]==0 || histogram_lo[entry*dim+j]>error)
-	      histogram_lo[entry*dim+j]=error;
-	    distance+=error*error;    
-	  }
-	  histogram_distance[entry]+=sqrt(distance);
-	}
-      }
+      for(i=0;i<n/dim;i++)
+	process_one(b,bi,a+i,dim,n/dim,addmul[bi]);
     }else{
-      for(i=0;i<=n-dim;i+=dim){
-	int entry=vorbis_book_besterror(b,work+i,1,addmul[bi]);
-	if(bi==books-1){
-	  double distance=0;
-	  histogram[entry]++;  
-	  
-	  for(j=0;j<dim;j++){
-	    double error=work[i+j];
-	    histogram_errorsq[entry*dim+j]+=error*error;
-	    histogram_error[entry*dim+j]+=fabs(error);
-	    if(histogram[entry]==0 || histogram_hi[entry*dim+j]<error)
-	      histogram_hi[entry*dim+j]=error;
-	    if(histogram[entry]==0 || histogram_lo[entry*dim+j]>error)
-	      histogram_lo[entry*dim+j]=error;
-	    distance+=error*error;    
-	  }
-	  histogram_distance[entry]+=sqrt(distance);
-	}
-      }
+      for(i=0;i<=n-dim;i+=dim)
+	process_one(b,bi,a+i,dim,1,addmul[bi]);
     }
   }
-
-  for(i=0;i<n;i++){
-    double error=work[i];
-    if(bs[0]->c->q_sequencep){
-      amplitude=a[i]-base;
-      base=a[i];
-    }else
-      amplitude=a[i];
-    
-    meanamplitude_acc+=fabs(amplitude);
-    meanamplitudesq_acc+=amplitude*amplitude;
-    meanerror_acc+=fabs(error);
-    meanerrorsq_acc+=error*error;
-    
-    if(amplitude)
-      meandev_acc+=fabs(error/amplitude);
-    else
-      meandev_acc+=fabs(error); /* yeah, yeah */
-  }
-
-  if((long)(count++)%100)spinnit("working.... lines: ",count);
+  
+  if((long)(count)%100)spinnit("working.... samples: ",count);
 }
 
 void process_usage(void){
   fprintf(stderr,
-	  "usage: vqmetrics +|*<codebook>.vqh [ +|*<codebook.vqh> ]... \n"
+	  "usage: vqmetrics [-i] +|*<codebook>.vqh [ +|*<codebook.vqh> ]... \n"
 	  "                 datafile.vqd [datafile.vqd]...\n\n"
-	  "       data can be taken on stdin.  Output goes to output files:\n"
+	  "       data can be taken on stdin.  -i indicates interleaved coding.\n"
+	  "       Output goes to output files:\n"
 	  "       basename-me.m:       gnuplot: mean error by entry value\n"
 	  "       basename-mse.m:      gnuplot: mean square error by entry value\n"
 	  "       basename-worst.m:    gnuplot: worst error by entry value\n"
diff --git a/vq/run.c b/vq/run.c
index d0fc7339..afe1ebf8 100644
--- a/vq/run.c
+++ b/vq/run.c
@@ -12,7 +12,7 @@
  ********************************************************************
 
  function: utility main for loading and operating on codebooks
- last mod: $Id: run.c,v 1.9.4.1 2000/04/21 16:35:40 xiphmont Exp $
+ last mod: $Id: run.c,v 1.9.4.2 2000/04/26 07:10:16 xiphmont Exp $
 
  ********************************************************************/
 
@@ -140,7 +140,7 @@ int main(int argc,char *argv[]){
 	}
 
 	reset_next_value();
-	line=get_line(in);
+	line=setup_line(in);
 	/* count cols before we start reading */
 	{
 	  char *temp=line;
@@ -161,8 +161,7 @@ int main(int argc,char *argv[]){
 	  /* ignores -s for now */
 	  process_vector(b,addmul,interleave,vec,cols);
 
-	  reset_next_value();
-	  line=get_line(in);
+	  line=setup_line(in);
 	}
 	fclose(in);
       }
author	Monty <xiphmont@xiph.org>	2000-04-26 07:10:16 +0000
committer	Monty <xiphmont@xiph.org>	2000-04-26 07:10:16 +0000
commit	de5ef5ea3d8ca2fffac1d247520aea82c7bf11f5 (patch)
tree	116c839881d24235e1cafed6409a6afed123345b
parent	097027d02df14cab9f0ffe712995aa7f7f16f392 (diff)
download	libvorbis-git-de5ef5ea3d8ca2fffac1d247520aea82c7bf11f5.tar.gz