summaryrefslogtreecommitdiff
path: root/substrings.c
diff options
context:
space:
mode:
authorCaolán McNamara <caolanm@redhat.com>2010-03-04 12:13:53 +0000
committerCaolán McNamara <caolanm@redhat.com>2010-03-04 12:13:53 +0000
commit21127cc8493a68d4fe9adbb71377b469b4f2b550 (patch)
treee1e70c8ae8b41fea06c791f5221ef85bb3c3ebf2 /substrings.c
downloadhyphen-21127cc8493a68d4fe9adbb71377b469b4f2b550.tar.gz
Initia import
Diffstat (limited to 'substrings.c')
-rw-r--r--substrings.c262
1 files changed, 262 insertions, 0 deletions
diff --git a/substrings.c b/substrings.c
new file mode 100644
index 0000000..1922db1
--- /dev/null
+++ b/substrings.c
@@ -0,0 +1,262 @@
+//
+// A utility for finding substring embeddings in patterns
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+
+#define MAXPATHS (256*1024)
+
+//
+//
+static void die(
+ const char*msg
+) {
+ fprintf(stderr,"%s\n",msg);
+ exit(1);
+}
+
+
+// Finds the index of an entry, only used on xxx_key arrays
+// Caveat: the table has to be sorted
+static int find_in(
+ char *tab[],
+ int max,
+ const char *pat
+) {
+ int left=0, right=max-1;
+ while (left <= right) {
+ int mid = ((right-left)/2)+left;
+ int v = strcmp(pat,tab[mid]);
+ if (v>0) {
+ left = mid + 1;
+ } else if (v<0) {
+ right = mid -1;
+ } else {
+ return mid;
+ }
+ }
+ return -1;
+}
+
+
+// used by partition (which is used by qsort_arr)
+//
+static void swap2(
+ char *a[],
+ char *b[],
+ int i,
+ int j
+) {
+ if (i==j) return;
+ char*v;
+ v=a[i]; a[i]=a[j]; a[j]=v;
+ v=b[i]; b[i]=b[j]; b[j]=v;
+}
+
+
+// used by qsort_arr
+//
+static int partition(
+ char *a[],
+ char *b[],
+ int left,
+ int right,
+ int p
+) {
+ const char *pivotValue = a[p];
+ int i;
+ swap2(a,b,p,right); // Move pivot to end
+ p = left;
+ for (i=left; i<right; i++) {
+ if (strcmp(a[i],pivotValue)<=0) {
+ swap2(a,b,p,i);
+ p++;
+ }
+ }
+ swap2(a,b,right,p); // Move pivot to its final place
+ return p;
+}
+
+
+//
+//
+static void qsort_arr(
+ char *a[],
+ char *b[],
+ int left,
+ int right
+) {
+ while (right > left) {
+ int p = left + (right-left)/2; //select a pivot
+ p = partition(a,b, left, right, p);
+ if ((p-1) - left < right - (p+1)) {
+ qsort_arr(a,b, left, p-1);
+ left = p+1;
+ } else {
+ qsort_arr(a,b, p+1, right);
+ right = p-1;
+ }
+ }
+}
+
+
+// Removes extra '0' entries from the string
+//
+static char* compact(
+ char *expr
+) {
+ int l=strlen(expr);
+ int i,j;
+ for (i=0,j=0; i<l; i++) {
+ if (expr[i]!='0') expr[j++] = expr[i];
+ }
+ expr[j]=0;
+ return expr;
+}
+
+
+// convert 'n1im' to 0n1i0m0 expressed as a string
+//
+static void expand(
+ char *expr,
+ const char *pat,
+ int l
+) {
+ int el = 0;
+ char last = '.';
+ int i;
+ for (i=0; i<l; i++) {
+ char c = pat[i];
+ if ( (last<'0' || last>'9')
+ && (c <'0' || c >'9')
+ ) {
+ expr[el++] = '0';
+ }
+ expr[el++] = c;
+ last = c;
+ }
+ if (last<'0' || last>'9') expr[el++] = '0';
+ expr[el]=0;
+}
+
+
+// Combine two patterns, i.e. .ad4der + a2d becomes .a2d4der
+// The second pattern needs to be a right side match of the first
+// (modulo digits)
+static char *combine(
+ char *expr,
+ const char *subexpr
+) {
+ int l1 = strlen(expr);
+ int l2 = strlen(subexpr);
+ int off = l1-l2;
+ int j;
+ // this works also for utf8 sequences because the substring is identical
+ // to the last substring-length bytes of expr except for the (single byte)
+ // hyphenation encoders
+ for (j=0; j<l2; j++) {
+ if (subexpr[j]>expr[off+j]) {
+ expr[off+j] = subexpr[j];
+ }
+ }
+ return expr;
+}
+
+
+//
+//
+int main(int argc, const char* argv[]) {
+ FILE *in, *out;
+ char *pattab_key[MAXPATHS];
+ char *pattab_val[MAXPATHS];
+ int patterns = 0;
+ char *newpattab_key[MAXPATHS];
+ char *newpattab_val[MAXPATHS];
+ int newpatterns = 0;
+ char format[132]; // 64+65+newline+zero+spare
+ int p;
+ if (argc!=3) die("Usage: <orig-file> <new-file>\n");
+ if ((in = fopen(argv[1],"r"))==NULL) die("Could not read input");
+ if ((out = fopen(argv[2],"w"))==NULL) die("Could not create output");
+ // read all patterns and split in pure text (_key) & expanded patterns (_val)
+ while(fgets(format,132,in)) {
+ int l = strlen(format);
+ if (format[l-1]=='\n') { l--; format[l]=0; } // Chomp
+ if (format[0]=='%' || format[0]==0) {
+ // skip
+ } else {
+ if (format[l-1]=='%') {
+ l--;
+ format[l] = 0; // remove '%'
+ }
+ int i,j;
+ char *pat = (char*) malloc(l+1);
+ char *org = (char*) malloc(l*2+1);
+ expand(org,format,l);
+ // remove hyphenation encoders (digits) from pat
+ for (i=0,j=0; i<l; i++) {
+ // odd, but utf-8 proof
+ char c = format[i];
+ if (c<'0' || c>'9') pat[j++]=c;
+ }
+ pat[j]=0;
+ p = patterns;
+ pattab_key[patterns] = pat;
+ pattab_val[patterns++] = org;
+ if (patterns>MAXPATHS) die("to many base patterns");
+ }
+ }
+ fclose(in);
+ // As we use binairy search, make sure it is sorted
+ qsort_arr(pattab_key,pattab_val,0,patterns-1);
+
+ for (p=0; p<patterns; p++) {
+ char *pat = pattab_key[p];
+ int patsize = strlen(pat);
+ int j,l;
+ for (l=1; l<=patsize; l++) {
+ for (j=1; j<=l; j++) {
+ int i = l-j;
+ int subpat_ndx;
+ char subpat[132];
+ strncpy(subpat,pat+i,j); subpat[j]=0;
+ if ((subpat_ndx = find_in(pattab_key,patterns,subpat))>=0) {
+ int newpat_ndx;
+ char *newpat=malloc(l+1);
+ //printf("%s is embedded in %s\n",pattab_val[subpat_ndx],pattab_val[p]);
+ strncpy(newpat, pat+0,l); newpat[l]=0;
+ if ((newpat_ndx = find_in(newpattab_key,newpatterns,newpat))<0) {
+ char *neworg = malloc(132); // TODO: compute exact length
+ expand(neworg,newpat,l);
+ newpattab_key[newpatterns] = newpat;
+ newpattab_val[newpatterns++] = combine(neworg,pattab_val[subpat_ndx]);
+ if (newpatterns>MAXPATHS) die("to many new patterns");
+ //printf("%*.*s|%*.*s[%s] (%s|%s) = %s\n",i,i,pat,j,j,pat+i,pat+i+j,pattab_val[p],pattab_val[subpat_ndx],neworg);
+ } else {
+ free(newpat);
+ newpattab_val[newpat_ndx] = combine(
+ newpattab_val[newpat_ndx], pattab_val[subpat_ndx] );
+ }
+ }
+ }
+ }
+ }
+
+ /* for some tiny extra speed, one could forget the free()s
+ * as the memory is freed anyway on exit().
+ * However, the gain is minimal and now the code can be cleanly
+ * incorporated into other code */
+ for (p=0; p<newpatterns; p++) {
+ fprintf(out,"%s\n",compact(newpattab_val[p]));
+ free(newpattab_key[p]);
+ free(newpattab_val[p]);
+ }
+ fclose(out);
+
+ for (p=0; p<patterns; p++) {
+ free(pattab_key[p]);
+ free(pattab_val[p]);
+ }
+ return 0;
+}