/* Copyright (C) 1999 Greg Schohn - gcs@jprc.com */ /* ******************** svm_fisher.c ******************* * An implementation of the naive bayes fisher kernel. * This is still a work very much in progress, ridden with * numerical precision problems. */ #include static bow_barrel *rainbow_nb_barrel; static double fisher_norm0; static int total_num_words_occurences; static double *dIi; /* approximate diagonal inverse information matrix for classes */ static double *dIij; /* approximate diagonal inverse information matrix for class-words */ typedef struct _NPair { int N; int index; } NPair; void svm_set_fisher_barrel_weights(bow_wv **docs, int ndocs) { int i,j; total_num_words_occurences = 0; for (i=0; inormalizer = 1.0; for (j=0; jnum_entries; j++) { docs[i]->entry[j].weight = (float) docs[i]->entry[j].count; total_num_words_occurences += docs[i]->entry[j].count; } } } double svm_kernel_fisher(bow_wv *wv1, bow_wv *wv2) { bow_cdoc *cd; int max_entries; /* max number of elements that can be in both */ int nclasses; int nwords; NPair *Nvector; double rval; double tmp; bow_we *v1, *v2; double t2, pci; int i, j, k; nwords = total_num_words_occurences; nclasses = bow_barrel_num_classes(rainbow_nb_barrel); max_entries = MIN(wv1->num_entries, wv2->num_entries); Nvector = (NPair *) alloca(max_entries*sizeof(NPair)); v1 = wv1->entry; v2 = wv2->entry; /* compute the N(wi,X1)*N(wi,X2) vector */ for (i=j=k=0; (i v2[j].wi) { j++; } else if (v1[i].wi < v2[j].wi) { i++; } else { Nvector[k].index = v1[i].wi; Nvector[k].N = (v1[i].count)*(v2[j].count); k++; i++; j++; } } max_entries = k; rval = 0.0; /* now we have all of the P(X*|C*) terms - in ascending order with * regards to class index */ for (i=0; ientry; for (h=0; hnum_entries; h++) { double sum, t; bow_dv *dv = bow_wi2dvf_dv(rainbow_nb_barrel->wi2dvf, v[h].wi); assert(dv); /* sum up the number of words that appeared in all of the classes */ for (k=0, sum=0.0; klength; k++) { sum += dv->entry[k].weight; } p_w = log(sum/nwords)*v[h].weight; t = (double) bow_naivebayes_pr_wi_ci (rainbow_nb_barrel, v[h].wi, i, -1, 0.0, 0.0, NULL, NULL); t = log(t) * v[h].weight; t2 += t - p_w; assert(finite(t2)); //printf("P(w%d|c%d)^%f, p_w^%f\n", v[h].wi, k, t, p_w); } } } cd = GET_CDOC_ARRAY_EL(rainbow_nb_barrel,i); pci = cd->prior; rval += exp(t2 + log(dIi[i] + (tmp*pci*pci))); assert(finite(rval)); } //rval = exp(rval); printf("kernel=%f\n",rval); return rval; } void svm_setup_fisher(bow_barrel *old_barrel, bow_wv **docs, int nclasses, int ndocs) { double *PXk, PX; int i,j,k; rainbow_method *tmp = old_barrel->method; old_barrel->method = &bow_method_naivebayes; rainbow_nb_barrel = bow_barrel_new_vpc_merge_then_weight (old_barrel); old_barrel->method = tmp; /* set some global variables that naivebayes.c uses */ naivebayes_score_returns_doc_pr = 1; naivebayes_score_unsorted = 1; fprintf(stderr, "Finding maximum kernel value for normalizing\n"); i = bow_num_words()*nclasses; dIi = (double *) malloc(sizeof(double)*nclasses); PXk = (double *) malloc(sizeof(double)*nclasses); dIij = (double *) malloc(sizeof(double)*i); for (j=0; j max_lpr) max_lpr = scores[k].weight; } for (k=0; kprior * exp(scores[k].weight - max_lpr); /* hacky-hacky-hacky-smoothing */ #define THRESH 1e-1 if (PXk[k] < THRESH) { PXk[k] = THRESH; printf("underflow on P(X%d|C%d) - setting to small val\n",i,k); fflush(stdout); } PX += PXk[k]; assert(finite(PXk[k]) && PXk[k] != 0.0); } free(scores); /* compute term for Iij - d/d-theta_ij * log(P(X|theta)) */ for (j=0; jnum_entries; j++) { for (k=0; kentry[j].wi, k, -1, 0.0, 0.0, NULL, NULL); dIij[k*nclasses + docs[i]->entry[j].wi] += ((((docs[i]->entry[j].count * PXk[k]) /tmp) /PX) * (((docs[i]->entry[j].count * PXk[k]) /tmp) /PX)); } } /* compute term for Ii - d/d-theta_i * log(P(X|theta)) */ for (k=0; k