Tokenize.test.js

import {tokenize, toksToTfIdfVector} from "lib/Tokenize.jsm";

const EPSILON = 0.00001;
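// `assert` is provided globally by the test harness (an assumption based
// on its bare usage below); EPSILON bounds the acceptable floating-point
// drift when comparing computed tf-idf scores with the hand-derived
// expected values.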
  3. describe("TF-IDF Term Vectorizer", () => {
  4. describe("#tokenize", () => {
  5. let testCases = [
  6. {input: "HELLO there", expected: ["hello", "there"]},
  7. {input: "blah,,,blah,blah", expected: ["blah", "blah", "blah"]},
  8. {input: "Call Jenny: 967-5309", expected: ["call", "jenny", "967", "5309"]},
  9. {input: "Yo(what)[[hello]]{{jim}}}bob{1:2:1+2=$3", expected: ["yo", "what", "hello", "jim", "bob", "1", "2", "1", "2", "3"]},
  10. {input: "čÄfė 80's", expected: ["čäfė", "80", "s"]},
  11. {input: "我知道很多东西。", expected: ["我知道很多东西"]},
  12. ];
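    // Taken together, these cases pin down the tokenizer contract: input is
    // lowercased, punctuation and whitespace act as delimiters, and Unicode
    // letter/digit runs (including unspaced CJK text) survive as tokens.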
    let checkTokenization = tc => {
      it(`${tc.input} should tokenize to ${tc.expected}`, () => {
        assert.deepEqual(tc.expected, tokenize(tc.input));
      });
    };
    for (let i = 0; i < testCases.length; i++) {
      checkTokenization(testCases[i]);
    }
  });
  22. describe("#tfidf", () => {
  23. let vocab_idfs = {
  24. deal: [221, 5.5058519847862275],
  25. easy: [269, 5.5058519847862275],
  26. tanks: [867, 5.6011621645905520],
  27. sites: [792, 5.9578371085292850],
  28. care: [153, 5.9578371085292850],
  29. needs: [596, 5.8243057159047620],
  30. finally: [334, 5.7065226802483790],
  31. };
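    // The vocabulary maps each token to [term id, idf weight]. A hedged
    // reference sketch (inferred from the expected values below, not part
    // of the original suite): toksToTfIdfVector appears to score each
    // in-vocabulary token as tf(tok) * idf(tok) and then L2-normalize the
    // resulting vector. This hypothetical helper illustrates that formula;
    // the tests themselves only exercise the real toksToTfIdfVector.
    let referenceTfIdf = (toks, vocab) => {
      let tfs = {};
      toks.filter(tok => tok in vocab).forEach(tok => {
        tfs[tok] = (tfs[tok] || 0) + 1;
      });
      let scores = {};
      Object.keys(tfs).forEach(tok => {
        scores[tok] = [vocab[tok][0], tfs[tok] * vocab[tok][1]];
      });
      let norm = Math.sqrt(Object.keys(scores)
        .reduce((acc, tok) => acc + scores[tok][1] * scores[tok][1], 0));
      Object.keys(scores).forEach(tok => {
        scores[tok][1] /= norm || 1;
      });
      return scores;
    };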
    let testCases = [
      {
        input: "Finally! Easy care for your tanks!",
        expected: {
          finally: [334, 0.50098162958537610],
          easy: [269, 0.48336453811728713],
          care: [153, 0.52304478763682270],
          tanks: [867, 0.49173191907236774],
        },
      },
      {
        input: "Easy easy EASY",
        expected: {easy: [269, 1.0]},
      },
      {
        input: "Easy easy care",
        expected: {
          easy: [269, 0.8795205218806832],
          care: [153, 0.4758609582543317],
        },
      },
      {
        input: "easy care",
        expected: {
          easy: [269, 0.6786999710383944],
          care: [153, 0.7344156515982504],
        },
      },
      {
        // "这个空间故意留空。" means "This space is intentionally left
        // blank." None of its tokens are in the vocabulary, so the
        // resulting vector is empty.
        input: "这个空间故意留空。",
        expected: { /* This space is left intentionally blank. */ },
      },
    ];
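    // Sanity check of the inferred formula on "Easy easy EASY": tf = 3 and
    // idf = 5.5058…, so the lone raw score is 3 * 5.5058… ≈ 16.5176; a
    // one-token vector's norm is that same value, so normalizing yields
    // exactly 1.0, matching the expectation above.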
    let checkTokenGeneration = tc => {
      describe(`${tc.input} should have only vocabulary tokens`, () => {
        let actual = toksToTfIdfVector(tokenize(tc.input), vocab_idfs);
        it(`${tc.input} should generate exactly ${Object.keys(tc.expected)}`, () => {
          let seen = {};
          Object.keys(actual).forEach(actualTok => {
            assert.isTrue(actualTok in tc.expected);
            seen[actualTok] = true;
          });
          Object.keys(tc.expected).forEach(expectedTok => {
            assert.isTrue(expectedTok in seen);
          });
        });
        it(`${tc.input} should have the correct token ids`, () => {
          Object.keys(actual).forEach(actualTok => {
            assert.equal(tc.expected[actualTok][0], actual[actualTok][0]);
          });
        });
      });
    };
    let checkTfIdfVector = tc => {
      let actual = toksToTfIdfVector(tokenize(tc.input), vocab_idfs);
      it(`${tc.input} should have the correct tf-idf`, () => {
        Object.keys(actual).forEach(actualTok => {
          let delta = Math.abs(tc.expected[actualTok][1] - actual[actualTok][1]);
          assert.isTrue(delta <= EPSILON);
        });
      });
    };
    // run the tests
    for (let i = 0; i < testCases.length; i++) {
      checkTokenGeneration(testCases[i]);
      checkTfIdfVector(testCases[i]);
    }
  });
});