We mainly focus on solve_c_svc in this note.
Our goal:
$$\min_{d_B}\ \frac{1}{2} d_B^T \nabla^2 f(\alpha^k)_{BB}\, d_B + \nabla f(\alpha^k)_B^T\, d_B$$
$$\text{s.t.}\quad y_B^T d_B = 0$$
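Here $f$ is the C-SVC dual objective (the linear term $p$ is the vector minus_ones passed to s.Solve below):
$$f(\alpha) = \frac{1}{2}\alpha^T Q\alpha + p^T\alpha, \qquad p = -e, \qquad Q_{ij} = y_i y_j K(x_i, x_j),$$
subject to $y^T\alpha = 0$ and $0 \le \alpha_t \le C_t$ (Cp or Cn depending on the label). In SMO-type methods the working set B has exactly two elements.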
The core function of training is solve_c_svc. It calls:
s.Solve(l, SVC_Q(*prob,*param,y), minus_ones, y, alpha, Cp, Cn, param->eps, si, param->shrinking);
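To keep the big picture in mind, here is a rough outline (a simplification, not the verbatim LIBSVM code) of what Solver::Solve does:
// Outline of Solver::Solve (simplified; shrinking and caching details omitted):
// 1. initialize alpha_status, the gradient G, and G_bar;
// 2. loop until the stopping condition holds:
//      select the working set {i,j} (select_working_set);
//      solve the two-variable subproblem for alpha[i], alpha[j];
//      update the gradient G with columns i and j of Q;
//      update alpha_status and G_bar;
// 3. calculate rho and the objective value, write back the solution.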
Algorithm 2 of Fan et al. (2005): an SMO-type decomposition method using WSS3.
After the initialization (trivial; we will discuss it later), the first step is to select the working set B = {i, j}. The chosen i and j are the pair that most violates the optimality condition, so it is a natural choice; we call it the "maximal violating pair".
The pair in WSS1 satisfies:
$$i \in \arg\max_t \{-y_t \nabla f(\alpha^k)_t \mid t \in I_{up}(\alpha^k)\}$$
$$j \in \arg\min_t \{-y_t \nabla f(\alpha^k)_t \mid t \in I_{low}(\alpha^k)\}$$
where
$$I_{up}(\alpha) \equiv \{t \mid \alpha_t < C, y_t = 1 \text{ or } \alpha_t > 0, y_t = -1\}, \text{ and}$$
$$I_{low}(\alpha) \equiv \{t \mid \alpha_t < C, y_t = -1 \text{ or } \alpha_t > 0, y_t = 1\}$$
In WSS3 the choice of j is different, because it uses second order information:
$$j \in \arg\min_t \left\{ -\frac{b_{it}^2}{\bar{a}_{it}} \;\middle|\; t \in I_{low}(\alpha^k),\ -y_t \nabla f(\alpha^k)_t < -y_i \nabla f(\alpha^k)_i \right\}$$
A detailed proof is given in Fan et al. (2005), Theorem 3.
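For reference, the quantities in the WSS3 rule are defined in Fan et al. (2005) as
$$a_{it} \equiv Q_{ii} + Q_{tt} - 2 y_i y_t Q_{it}, \qquad b_{it} \equiv -y_i \nabla f(\alpha^k)_i + y_t \nabla f(\alpha^k)_t > 0,$$
$$\bar{a}_{it} \equiv \begin{cases} a_{it} & \text{if } a_{it} > 0 \\ \tau & \text{otherwise,} \end{cases}$$
where $\tau$ (TAU in the code) is a small positive constant that guards against non-positive curvature. These map to quad_coef and grad_diff in the code below.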
if(select_working_set(i,j)!=0)
{
	// reconstruct the whole gradient over all l variables
	reconstruct_gradient();
	// reset active set size and check again
	active_size = l;
	info("*");
	if(select_working_set(i,j)!=0)	// returns 1 if the stopping condition holds (optimal)
		break;
	else
		counter = 1;	// do shrinking in the next iteration
}
Working set selection subroutine (WSS3):
Note that we only discuss the case y = +1 below; the other case is similar. Also note that there is no explicit y in the code, because y = +1 is already absorbed into the sign of G[t].
Recall that
$$i \in \arg\max_t \{-y_t \nabla f(\alpha^k)_t \mid t \in I_{up}(\alpha^k)\},$$
where $I_{up}(\alpha) \equiv \{t \mid \alpha_t < C, y_t = 1 \text{ or } \alpha_t > 0, y_t = -1\}$.
We want to find the t whose $-y_t \nabla f(\alpha^k)_t$ is maximal.
Select i:
// return i,j such that
// i: maximizes -y_i * grad(f)_i, i in I_up(\alpha)
// j: minimizes the decrease of obj value
//    (if quadratic coefficient <= 0, replace it with tau)
//    -y_j*grad(f)_j < -y_i*grad(f)_i, j in I_low(\alpha)
double Gmax = -INF;	// running maximum of -y_t * grad(f)_t
int Gmax_idx = -1;
for(int t=0;t<active_size;t++)
	if(y[t]==+1)
	{
		if(!is_upper_bound(t))	// t in I_up: alpha_t < C, y_t = +1
			if(-G[t] >= Gmax)
			{
				Gmax = -G[t];
				Gmax_idx = t;
			}
	}
	else
	{
		if(!is_lower_bound(t))	// t in I_up: alpha_t > 0, y_t = -1
			if(G[t] >= Gmax)
			{
				Gmax = G[t];
				Gmax_idx = t;
			}
	}
Select j:
In the code below (only the y_j = +1 branch is shown), quad_coef is $a_{ij}$ and grad_diff is $b_{ij}$.
if (!is_lower_bound(j))	// j in I_low: alpha_j > 0, y_j = +1
{
	double grad_diff=Gmax+G[j];	// b_ij; since y[j]==+1, -y_j*grad(f)_j = -G[j]
	if (G[j] >= Gmax2)
		Gmax2 = G[j];	// track the max violation for the stopping check
	if (grad_diff > 0)
	{
		double obj_diff;
		// quad_coef is "a" in Fan et al. (2005): Q_ii + Q_tt - 2*y_i*y_t*Q_it; here y[j]==+1
		double quad_coef = QD[i]+QD[j]-2.0*y[i]*Q_i[j];
		if (quad_coef > 0)
			obj_diff = -(grad_diff*grad_diff)/quad_coef;
		else
			obj_diff = -(grad_diff*grad_diff)/TAU;	// guard against non-positive curvature
		if (obj_diff <= obj_diff_min)
		{
			Gmin_idx=j;
			obj_diff_min = obj_diff;
		}
	}
}
After choosing i and j for the set B = {i, j}, we solve the two-variable subproblem in closed form and update $\alpha_i$ and $\alpha_j$, clipping them back into the feasible box when necessary:
const Qfloat *Q_i = Q.get_Q(i,active_size);
const Qfloat *Q_j = Q.get_Q(j,active_size);
double C_i = get_C(i);
double C_j = get_C(j);
double old_alpha_i = alpha[i];
double old_alpha_j = alpha[j];
if(y[i]!=y[j])
{
	double quad_coef = QD[i]+QD[j]+2*Q_i[j];
	if (quad_coef <= 0)
		quad_coef = TAU;
	double delta = (-G[i]-G[j])/quad_coef;
	double diff = alpha[i] - alpha[j];
	alpha[i] += delta;
	alpha[j] += delta;
	// clip back into the feasible region; alpha_i - alpha_j stays constant
	if(diff > 0)
	{
		if(alpha[j] < 0)
		{
			alpha[j] = 0;
			alpha[i] = diff;
		}
	}
	else
	{
		if(alpha[i] < 0)
		{
			alpha[i] = 0;
			alpha[j] = -diff;
		}
	}
	if(diff > C_i - C_j)
	{
		if(alpha[i] > C_i)
		{
			alpha[i] = C_i;
			alpha[j] = C_i - diff;
		}
	}
	else
	{
		if(alpha[j] > C_j)
		{
			alpha[j] = C_j;
			alpha[i] = C_j + diff;
		}
	}
}
else
{
	double quad_coef = QD[i]+QD[j]-2*Q_i[j];
	if (quad_coef <= 0)
		quad_coef = TAU;
	double delta = (G[i]-G[j])/quad_coef;
	double sum = alpha[i] + alpha[j];
	alpha[i] -= delta;
	alpha[j] += delta;
	// clip back into the feasible region; alpha_i + alpha_j stays constant
	if(sum > C_i)
	{
		if(alpha[i] > C_i)
		{
			alpha[i] = C_i;
			alpha[j] = sum - C_i;
		}
	}
	else
	{
		if(alpha[j] < 0)
		{
			alpha[j] = 0;
			alpha[i] = sum;
		}
	}
	if(sum > C_j)
	{
		if(alpha[j] > C_j)
		{
			alpha[j] = C_j;
			alpha[i] = sum - C_j;
		}
	}
	else
	{
		if(alpha[i] < 0)
		{
			alpha[i] = 0;
			alpha[j] = sum;
		}
	}
}
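As a sanity check, the closed-form delta can be derived by hand. Take the $y_i = y_j$ case (the else branch above): the constraint $y_B^T d_B = 0$ forces $d_i = -d_j$, so write $d_i = -\delta$, $d_j = \delta$. Substituting into the subproblem objective gives
$$\phi(\delta) = \left(\nabla f(\alpha^k)_j - \nabla f(\alpha^k)_i\right)\delta + \frac{1}{2}\left(Q_{ii} + Q_{jj} - 2Q_{ij}\right)\delta^2,$$
and setting $\phi'(\delta) = 0$ yields
$$\delta = \frac{\nabla f(\alpha^k)_i - \nabla f(\alpha^k)_j}{Q_{ii} + Q_{jj} - 2Q_{ij}} = \frac{G[i] - G[j]}{\text{quad\_coef}},$$
exactly the delta in the code. The if/else blocks afterwards merely clip $(\alpha_i, \alpha_j)$ back into the box $[0, C_i] \times [0, C_j]$ while keeping the linear constraint satisfied.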
Then we update the gradient G, i.e. compute $\nabla f(\alpha^{k+1})$:
double delta_alpha_i = alpha[i] - old_alpha_i;
double delta_alpha_j = alpha[j] - old_alpha_j;
for(int k=0;k<active_size;k++)
{
	G[k] += Q_i[k]*delta_alpha_i + Q_j[k]*delta_alpha_j;
}
This works because the subproblem's linear term can be obtained from the maintained gradient,
$$Q_{BN}\,\alpha_N^k + p_B = \nabla f(\alpha^k)_B - Q_{BB}\,\alpha_B^k,$$
and after the subproblem is solved the gradient is updated by
$$\nabla f(\alpha^{k+1}) = \nabla f(\alpha^k) + Q_{:,B}\,(\alpha_B^{k+1} - \alpha_B^k),$$
where $Q_{:,B}$ is the sub-matrix of Q consisting of the columns in B. After the sub-problem is solved, $\nabla f(\alpha^{k+1})$ is acquired. Therefore, LIBSVM maintains the gradient throughout the decomposition method.
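One more bookkeeping step: the next block maintains G_bar, the gradient contribution of the variables stuck at the upper bound,
$$\bar{G}_k \equiv \sum_{t:\ \alpha_t = C_t} C_t\, Q_{kt},$$
which reconstruct_gradient() uses to cheaply recover the full gradient after shrinking. Whenever $\alpha_i$ enters or leaves the upper bound, column i of Q is added to or subtracted from G_bar: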
bool ui = is_upper_bound(i);
bool uj = is_upper_bound(j);
update_alpha_status(i);
update_alpha_status(j);
int k;
if(ui != is_upper_bound(i))	// alpha_i crossed the upper bound
{
	Q_i = Q.get_Q(i,l);
	if(ui)	// left the upper bound: remove its contribution
		for(k=0;k<l;k++)
			G_bar[k] -= C_i * Q_i[k];
	else	// reached the upper bound: add its contribution
		for(k=0;k<l;k++)
			G_bar[k] += C_i * Q_i[k];
}
if(uj != is_upper_bound(j))	// alpha_j crossed the upper bound
{
	Q_j = Q.get_Q(j,l);
	if(uj)
		for(k=0;k<l;k++)
			G_bar[k] -= C_j * Q_j[k];
	else
		for(k=0;k<l;k++)
			G_bar[k] += C_j * Q_j[k];
}
Whew, we are almost there:
// calculate rho
si->rho = calculate_rho();
// calculate objective value
{
	double v = 0;
	int i;
	for(i=0;i<l;i++)
		v += alpha[i] * (G[i] + p[i]);
	si->obj = v/2;
}
// put back the solution
{
	for(int i=0;i<l;i++)
		alpha_[active_set[i]] = alpha[i];
}
si->upper_bound_p = Cp;
si->upper_bound_n = Cn;
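Why v/2 is the objective value: since $G = \nabla f(\alpha) = Q\alpha + p$, we have
$$\alpha^T(G + p) = \alpha^T Q\alpha + 2p^T\alpha = 2\left(\frac{1}{2}\alpha^T Q\alpha + p^T\alpha\right) = 2f(\alpha),$$
so summing alpha[i]*(G[i]+p[i]) over all i and halving gives exactly $f(\alpha)$.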
INPUT & OUTPUT
After training with svm-train, the file "*.model" shows up. In svm.cpp, where almost all of the useful functions live, the function svm_save_model is responsible for this output. The file looks like:
svm_type c_svc
kernel_type rbf
gamma 0.25
nr_class 2
total_sv 3053
rho -0.495266
label 1 0
nr_sv 1996 1057
SV
0.2759747847329309 1:26.173 2:58.867 3:-0.1894697 4:125.1225
0.5042408472321596 1:57.07397 2:221.404 3:0.08607959 4:122.9114
0.5047343497502841 1:17.259 2:173.436 3:-0.1298053 4:125.0318
0.450868920775192 1:21.7794 2:124.9531 3:0.1538853 4:152.715
0.5049158714985423 1:91.33997 2:293.5699 3:0.1423918 4:160.5402
0.5048770796985661 1:55.375 2:179.222 3:0.1654953 4:111.2273
0.5047914121992919 1:29.562 2:191.357 3:0.09901439 4:103.4076
The first column: $y_i \alpha_i$, where $\alpha$ is the dual solution; there are nr_class − 1 such coefficients per line (here nr_class = 2, so one).
The data following it: each line represents one support vector, stored as sparse index:value pairs.
for(int i=0;i<l;i++)
{
	for(int j=0;j<nr_class-1;j++)
		fprintf(fp, "%.16g ",sv_coef[j][i]);	// the first column: y*alpha
	const svm_node *p = SV[i];
	if(param.kernel_type == PRECOMPUTED)
		fprintf(fp,"0:%d ",(int)(p->value));
	else
		while(p->index != -1)
		{
			fprintf(fp,"%d:%.8g ",p->index,p->value);	// index: 1-4; SV value
			p++;
		}
	fprintf(fp, "\n");
}
One part of the svm_train function:
decision_function f = svm_train_one(prob,param,0,0);
// receive f from svm_train_one
int nSV = 0;	// count the support vectors (nonzero alphas)
int i;
for(i=0;i<prob->l;i++)
	if(fabs(f.alpha[i]) > 0) ++nSV;
int j = 0;
for(i=0;i<prob->l;i++)
	if(fabs(f.alpha[i]) > 0)
	{
		model->SV[j] = prob->x[i];
		model->sv_coef[0][j] = f.alpha[i];
		model->sv_indices[j] = i+1;	// 1-based index into the training set
		++j;
	}
In svm_train, the function first allocates a new svm_model and returns it at the end.
svm_model *model = Malloc(svm_model,1);
In main(), model receives it, and its data is saved to the file:
model = svm_train(&prob,&param);
if(svm_save_model(model_file_name,model))
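As a usage sketch, the saved model can be loaded back for prediction with the standard LIBSVM API (the model file name and test vector below are made up for illustration):
#include "svm.h"
#include <stdio.h>

int main()
{
	// load the model saved by svm-train
	struct svm_model *model = svm_load_model("data.model");
	if(model == NULL) { fprintf(stderr, "cannot load model\n"); return 1; }

	// a sparse test vector with 4 features, terminated by index -1
	struct svm_node x[5] = {
		{1, 26.173}, {2, 58.867}, {3, -0.1894697}, {4, 125.1225}, {-1, 0}
	};
	printf("predicted label: %g\n", svm_predict(model, x));

	svm_free_and_destroy_model(&model);
	return 0;
}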
Reading the problem from the input file is handled by read_problem. The main steps are:
void read_problem(const char *filename)
{
	//...
	prob.y = Malloc(double,prob.l);
	prob.x = Malloc(struct svm_node *,prob.l);
	x_space = Malloc(struct svm_node,elements);
	// elements = total number of index:value entries (including the -1 terminators)
	prob.x[i] = &x_space[j];
	// x_space is an array of svm_node (index,value) pairs; prob.x is an array of svm_node *.
	// so prob.x[i][0].value is x_space[j].value
	// prob.y holds the labels
	//...
	x_space[j].index = (int) strtol(idx,&endptr,10);
	x_space[j].value = strtod(val,&endptr);
}
2 IMPORTANT structs, actually 2+2 (two for the data, two for the parameters and the model):
struct svm_node	// one (index,value) pair of a sparse feature vector
{
	int index;	// feature index (1-based); -1 terminates a vector
	double value;
};
struct svm_problem	// the training data
{
	int l;	// number of training instances
	double *y;	// labels
	struct svm_node **x;	// instances, each a -1-terminated svm_node array
};
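To make the sparse representation concrete, here is a minimal sketch of a two-point training set built by hand (the numbers are made up):
// two training points: (1.0, 2.0) labeled +1 and (3.0, 4.0) labeled -1
struct svm_node x0[] = { {1, 1.0}, {2, 2.0}, {-1, 0} };	// index -1 terminates a vector
struct svm_node x1[] = { {1, 3.0}, {2, 4.0}, {-1, 0} };
struct svm_node *x[] = { x0, x1 };
double y[] = { +1, -1 };
struct svm_problem prob = { 2, y, x };	// l = 2 instances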
struct svm_parameter
{
	int svm_type;
	int kernel_type;
	int degree;	/* for poly */
	double gamma;	/* for poly/rbf/sigmoid */
	double coef0;	/* for poly/sigmoid */

	/* these are for training only */
	double cache_size;	/* in MB */
	double eps;	/* stopping criteria */
	double C;	/* for C_SVC, EPSILON_SVR and NU_SVR */
	int nr_weight;	/* for C_SVC */
	int *weight_label;	/* for C_SVC */
	double* weight;	/* for C_SVC */
	double nu;	/* for NU_SVC, ONE_CLASS, and NU_SVR */
	double p;	/* for EPSILON_SVR */
	int shrinking;	/* use the shrinking heuristics */
	int probability;	/* do probability estimates */
};
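A minimal sketch of filling in svm_parameter for C-SVC with an RBF kernel; the values follow svm-train's defaults (gamma is taken from the model file above):
struct svm_parameter param;
param.svm_type = C_SVC;
param.kernel_type = RBF;
param.degree = 3;	// unused by RBF
param.gamma = 0.25;
param.coef0 = 0;	// unused by RBF
param.cache_size = 100;	// MB
param.eps = 1e-3;	// stopping tolerance
param.C = 1;
param.nr_weight = 0;	// no per-class C weights
param.weight_label = NULL;
param.weight = NULL;
param.nu = 0.5;	// unused by C_SVC
param.p = 0.1;	// unused by C_SVC
param.shrinking = 1;
param.probability = 0;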
struct svm_model
{
	struct svm_parameter param;	/* parameter */
	int nr_class;	/* number of classes, = 2 in regression/one class svm */
	int l;	/* total #SV */
	struct svm_node **SV;	/* SVs (SV[l]) */
	double **sv_coef;	/* coefficients for SVs in decision functions (sv_coef[k-1][l]) */
	double *rho;	/* constants in decision functions (rho[k*(k-1)/2]) */
	double *probA;	/* pairwise probability information */
	double *probB;
	int *sv_indices;	/* sv_indices[0,...,nSV-1] are values in [1,...,num_training_data] to indicate SVs in the training set */

	/* for classification only */
	int *label;	/* label of each class (label[k]) */
	int *nSV;	/* number of SVs for each class (nSV[k]) */
			/* nSV[0] + nSV[1] + ... + nSV[k-1] = l */
	int free_sv;	/* 1 if svm_model is created by svm_load_model */
			/* 0 if svm_model is created by svm_train */
};
Citations:
[1] C.-C. Chang and C.-J. Lin, "LIBSVM: a library for support vector machines." http://www.csie.ntu.edu.tw/~cjlin/papers/libsvm.pdf
[2] R.-E. Fan, P.-H. Chen, and C.-J. Lin, "Working set selection using second order information for training support vector machines," Journal of Machine Learning Research, 2005.