/* svm-scale.c */
  1. #include <float.h>
  2. #include <stdio.h>
  3. #include <stdlib.h>
  4. #include <ctype.h>
  5. #include <string.h>
  /*
   * Print the command-line usage summary on standard output and terminate
   * the process with exit status 1.  Never returns.
   */
  void exit_with_help()
  {
      static const char usage[] =
          "Usage: svm-scale [options] data_filename\n"
          "options:\n"
          "-l lower : x scaling lower limit (default -1)\n"
          "-u upper : x scaling upper limit (default +1)\n"
          "-y y_lower y_upper : y scaling limits (default: no y scaling)\n"
          "-s save_filename : save scaling parameters to save_filename\n"
          "-r restore_filename : restore scaling parameters from restore_filename\n";

      /* The text holds no conversion specifiers, so it is emitted verbatim. */
      fputs(usage, stdout);
      exit(1);
  }
  /* Shared line buffer filled by readline(); grown on demand. */
  char *line;
  /* Current capacity of `line` in bytes; doubled whenever a line does not fit. */
  int max_line_len = 1024;

  /* Target range for feature (x) scaling and, when enabled, label (y) scaling. */
  double lower=-1.0,upper=1.0,y_lower,y_upper;
  /* Non-zero when labels are scaled too (-y given, or a restored file has a y section). */
  int y_scaling = 0;

  /* Per-feature observed maximum / minimum, indexed by feature index. */
  double *feature_max;
  double *feature_min;
  /* Observed label maximum / minimum; start at the extremes so any value replaces them. */
  double y_max = -DBL_MAX;
  double y_min = DBL_MAX;
  /* Largest feature index found in the data file. */
  int max_index;

  /*
   * Larger / smaller of two values.  Every argument and the whole expansion
   * are parenthesized so that operands containing lower-precedence operators
   * (e.g. `a & b`, `c ? d : e`) are grouped as the caller wrote them.
   * Arguments are still evaluated more than once: do not pass expressions
   * with side effects.
   */
  #define max(x,y) (((x)>(y))?(x):(y))
  #define min(x,y) (((x)<(y))?(x):(y))

  void output_target(double value);
  void output(int index, double value);
  char* readline(FILE *input);
  33. int main(int argc,char **argv)
  34. {
  35. int i,index;
  36. FILE *fp;
  37. char *save_filename = NULL;
  38. char *restore_filename = NULL;
  39. for(i=1;i<argc;i++)
  40. {
  41. if(argv[i][0] != '-') break;
  42. ++i;
  43. switch(argv[i-1][1])
  44. {
  45. case 'l': lower = atof(argv[i]); break;
  46. case 'u': upper = atof(argv[i]); break;
  47. case 'y':
  48. y_lower = atof(argv[i]);
  49. ++i;
  50. y_upper = atof(argv[i]);
  51. y_scaling = 1;
  52. break;
  53. case 's': save_filename = argv[i]; break;
  54. case 'r': restore_filename = argv[i]; break;
  55. default:
  56. fprintf(stderr,"unknown option\n");
  57. exit_with_help();
  58. }
  59. }
  60. if(!(upper > lower) || (y_scaling && !(y_upper > y_lower)))
  61. {
  62. fprintf(stderr,"inconsistent lower/upper specification\n");
  63. exit(1);
  64. }
  65. if(argc != i+1)
  66. exit_with_help();
  67. fp=fopen(argv[i],"r");
  68. if(fp==NULL)
  69. {
  70. fprintf(stderr,"can't open file %s\n", argv[i]);
  71. exit(1);
  72. }
  73. line = (char *) malloc(max_line_len*sizeof(char));
  74. #define SKIP_TARGET\
  75. while(isspace(*p)) ++p;\
  76. while(!isspace(*p)) ++p;
  77. #define SKIP_ELEMENT\
  78. while(*p!=':') ++p;\
  79. ++p;\
  80. while(isspace(*p)) ++p;\
  81. while(*p && !isspace(*p)) ++p;
  82. /* assumption: min index of attributes is 1 */
  83. /* pass 1: find out max index of attributes */
  84. max_index = 0;
  85. while(readline(fp)!=NULL)
  86. {
  87. char *p=line;
  88. SKIP_TARGET
  89. while(sscanf(p,"%d:%*f",&index)==1)
  90. {
  91. max_index = max(max_index, index);
  92. SKIP_ELEMENT
  93. }
  94. }
  95. feature_max = (double *)malloc((max_index+1)* sizeof(double));
  96. feature_min = (double *)malloc((max_index+1)* sizeof(double));
  97. if(feature_max == NULL || feature_min == NULL)
  98. {
  99. fprintf(stderr,"can't allocate enough memory\n");
  100. exit(1);
  101. }
  102. for(i=0;i<=max_index;i++)
  103. {
  104. feature_max[i]=-DBL_MAX;
  105. feature_min[i]=DBL_MAX;
  106. }
  107. rewind(fp);
  108. /* pass 2: find out min/max value */
  109. while(readline(fp)!=NULL)
  110. {
  111. char *p=line;
  112. int next_index=1;
  113. double target;
  114. double value;
  115. sscanf(p,"%lf",&target);
  116. y_max = max(y_max,target);
  117. y_min = min(y_min,target);
  118. SKIP_TARGET
  119. while(sscanf(p,"%d:%lf",&index,&value)==2)
  120. {
  121. for(i=next_index;i<index;i++)
  122. {
  123. feature_max[i]=max(feature_max[i],0);
  124. feature_min[i]=min(feature_min[i],0);
  125. }
  126. feature_max[index]=max(feature_max[index],value);
  127. feature_min[index]=min(feature_min[index],value);
  128. SKIP_ELEMENT
  129. next_index=index+1;
  130. }
  131. for(i=next_index;i<=max_index;i++)
  132. {
  133. feature_max[i]=max(feature_max[i],0);
  134. feature_min[i]=min(feature_min[i],0);
  135. }
  136. }
  137. rewind(fp);
  138. /* pass 2.5: save/restore feature_min/feature_max */
  139. if(restore_filename)
  140. {
  141. FILE *fp_restore = fopen(restore_filename,"r");
  142. int idx, c;
  143. double fmin, fmax;
  144. if(fp_restore==NULL)
  145. {
  146. fprintf(stderr,"can't open file %s\n", restore_filename);
  147. exit(1);
  148. }
  149. if((c = fgetc(fp_restore)) == 'y')
  150. {
  151. fscanf(fp_restore, "%lf %lf\n", &y_lower, &y_upper);
  152. fscanf(fp_restore, "%lf %lf\n", &y_min, &y_max);
  153. y_scaling = 1;
  154. }
  155. else
  156. ungetc(c, fp_restore);
  157. if (fgetc(fp_restore) == 'x') {
  158. fscanf(fp_restore, "%lf %lf\n", &lower, &upper);
  159. while(fscanf(fp_restore,"%d %lf %lf\n",&idx,&fmin,&fmax)==3)
  160. {
  161. if(idx<=max_index)
  162. {
  163. feature_min[idx] = fmin;
  164. feature_max[idx] = fmax;
  165. }
  166. }
  167. }
  168. fclose(fp_restore);
  169. }
  170. if(save_filename)
  171. {
  172. FILE *fp_save = fopen(save_filename,"w");
  173. if(fp_save==NULL)
  174. {
  175. fprintf(stderr,"can't open file %s\n", save_filename);
  176. exit(1);
  177. }
  178. if(y_scaling)
  179. {
  180. fprintf(fp_save, "y\n");
  181. fprintf(fp_save, "%.16g %.16g\n", y_lower, y_upper);
  182. fprintf(fp_save, "%.16g %.16g\n", y_min, y_max);
  183. }
  184. fprintf(fp_save, "x\n");
  185. fprintf(fp_save, "%.16g %.16g\n", lower, upper);
  186. for(i=1;i<=max_index;i++)
  187. {
  188. if(feature_min[i]!=feature_max[i])
  189. fprintf(fp_save,"%d %.16g %.16g\n",i,feature_min[i],feature_max[i]);
  190. }
  191. fclose(fp_save);
  192. }
  193. /* pass 3: scale */
  194. while(readline(fp)!=NULL)
  195. {
  196. char *p=line;
  197. int next_index=1;
  198. int index;
  199. double target;
  200. double value;
  201. sscanf(p,"%lf",&target);
  202. output_target(target);
  203. SKIP_TARGET
  204. while(sscanf(p,"%d:%lf",&index,&value)==2)
  205. {
  206. for(i=next_index;i<index;i++)
  207. output(i,0);
  208. output(index,value);
  209. SKIP_ELEMENT
  210. next_index=index+1;
  211. }
  212. for(i=next_index;i<=max_index;i++)
  213. output(i,0);
  214. printf("\n");
  215. }
  216. free(line);
  217. free(feature_max);
  218. free(feature_min);
  219. fclose(fp);
  220. return 0;
  221. }
  222. char* readline(FILE *input)
  223. {
  224. int len;
  225. if(fgets(line,max_line_len,input) == NULL)
  226. return NULL;
  227. while(strrchr(line,'\n') == NULL)
  228. {
  229. max_line_len *= 2;
  230. line = (char *) realloc(line, max_line_len);
  231. len = strlen(line);
  232. if(fgets(line+len,max_line_len-len,input) == NULL)
  233. break;
  234. }
  235. return line;
  236. }
  237. void output_target(double value)
  238. {
  239. if(y_scaling)
  240. {
  241. if(value == y_min)
  242. value = y_lower;
  243. else if(value == y_max)
  244. value = y_upper;
  245. else value = y_lower + (y_upper-y_lower) *
  246. (value - y_min)/(y_max-y_min);
  247. }
  248. printf("%g ",value);
  249. }
  250. void output(int index, double value)
  251. {
  252. /* skip single-valued attribute */
  253. if(feature_max[index] == feature_min[index])
  254. return;
  255. if(value == feature_min[index])
  256. value = lower;
  257. else if(value == feature_max[index])
  258. value = upper;
  259. else
  260. value = lower + (upper-lower) *
  261. (value-feature_min[index])/
  262. (feature_max[index]-feature_min[index]);
  263. if(value != 0)
  264. printf("%d:%g ",index, value);
  265. }