parser.js 9.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556
  1. var levenshtein = require('fast-levenshtein').get;
  2. var _ = require('underscore');
  3. module.exports = function(game) {
  // Lexicon shared by the parsing and preprocessing functions of this module.
  // `actions` and `verbs` are declared up front because processVerbList and
  // actionFromVerb index into them.
  var lex = {
    start_word: {}, // first word of a multi-word noun -> list of candidate entries
    pos: {},        // word -> part of speech ('verb', 'noun')
    actions: {},    // action name -> list of verbs mapped to it
    verbs: {},      // verb -> action name
    players: {},    // lower-cased player name or nick -> player name
    monsters: {},   // no code in view writes to this table
    items: {},      // no code in view writes to this table
    weapons: {},    // no code in view writes to this table
  };
  12. /*
  13. <builtin>
  14. (optional)
  15. list...
  16. sentence = subject verb
  17. sentence = subject verb direct_object
  18. sentence = subject verb indirect_object direct_object
  19. subject = noun
  20. direct_object = noun
  21. indirect_object = noun
  22. noun = noun
  23. noun = noun_clause
  24. noun = adjective noun
  25. verb = verb
  26. verb = adverb verb
  27. adjective = adjective
  28. adjective = adverb adjective
  29. adjective = adjective_phrase
  30. adverb = adverb
  31. adverb = adverb adverb
  32. adverb = adverb_phrase
  33. */
  // Empty placeholder object; none of the functions in view read it.
  var sample = {};
  // Parse one raw line of input into a sentence structure.
  //
  // The text is lower-cased and split into words, and each word is stemmed
  // before the list is handed to parseSentence. Articles and prepositions are
  // passed through unstemmed: the stemmer would turn "the" into "th" and
  // "one" into "on", after which isArticle / isPreposition could no longer
  // recognise them.
  //
  // raw: string to parse.
  // Returns whatever parseSentence returns for the normalised word list.
  function parse(raw) {
    var words = wordSplit(raw.toLowerCase());
    var list = words.map(function(word) {
      if(isArticle(word) || isPreposition(word)) return word;
      return stemmer(word);
    });
    return parseSentence(list);
  }
  // Parse a word list as one sentence. Only active verb forms are considered:
  //   sentence = subject verb
  //   sentence = subject verb direct_object
  //   sentence = subject verb indirect_object direct_object
  // Prepositional phrases that directly follow the verb (phrasal verbs) are
  // collected and attached to the direct object, which is replaced by an
  // implied pronoun when the sentence has none.
  //
  // list: array of normalised words.
  // Returns false when no subject or no verb is found; otherwise
  //   {type: 'sentence', subject, verb, iobj, dobj}
  // where iobj is false or null and dobj is false when absent.
  function parseSentence(list) {
    var s = parseSubject(list);
    if(s === false) {
      console.log('parser: [sentence] failed to find subject');
      // without a subject there is no remainder to continue from
      return false;
    }

    var v = parseSentVerb(s.remain);
    if(v === false) {
      console.log('parser: [sentence] failed to find verb');
      return false;
    }

    // detect phrasal verbs using prepositions
    var verb_preps = [];
    var l = v.remain;
    var p = parsePrepPhrase(l);
    while(p) {
      verb_preps.push(p);
      l = p.remain;
      p = parsePrepPhrase(l);
    }

    var iobj = parseIndirectObject(l);
    var r;
    if(iobj === false) {
      console.log('parser: [sentence] failed to find indirect object');
      // resume after the verb's prepositional phrases, not at v.remain:
      // otherwise the preposition itself would be parsed as the direct object
      r = l;
    }
    else r = iobj.remain;

    var dobj = parseDirectObject(r);
    if(dobj === false) {
      console.log('parser: [sentence] failed to find direct object');
    }

    // a single object after the verb is the direct object
    if(iobj && !dobj) {
      dobj = iobj;
      iobj = null;
    }

    // add any phrasal verb prepositions to the direct object, implied or otherwise
    if(verb_preps.length) {
      if(!dobj) {
        dobj = {
          type: 'pronoun',
          pro: '-implied-',
          preps: [],
        };
      }
      dobj.preps = (dobj.preps || []).concat(verb_preps);
    }

    return {
      type: 'sentence',
      subject: s,
      verb: v,
      iobj: iobj,
      dobj: dobj,
    };
  }
  // A sentence subject is parsed as an ordinary noun phrase.
  // list: array of words. Returns parseNounPhrase's result unchanged.
  function parseSubject(list) {
    var subject = parseNounPhrase(list);
    return subject;
  }
  // The verb of a sentence is parsed as a verb phrase.
  // list: array of words. Returns parseVerbPhrase's result unchanged.
  function parseSentVerb(list) {
    var verbPhrase = parseVerbPhrase(list);
    return verbPhrase;
  }
  // Parse a noun phrase from the front of a word list:
  //   noun_phrase = [article] adjective* noun prep_phrase*
  //
  // list: array of words.
  // Returns false when the list is missing/empty or no noun is found;
  // otherwise {type: 'noun_phrase', noun, adjectives, article, preps, remain},
  // where `remain` holds the words left after the whole phrase, including
  // its trailing prepositional phrases.
  function parseNounPhrase(list) {
    if(!list || list.length == 0) { return false; }

    // optional article
    var ar = parseArticle(list);
    var l = (ar && ar.remain) || list;

    // BUG: ambiguous or compound nouns starting with an adjective: "Green Lantern"
    // many optional adjectives
    var adjs = [];
    var adj = parseAdjective(l);
    while(adj) {
      adjs.push(adj);
      l = adj.remain;
      adj = parseAdjective(l);
    }

    var n = parseNoun(l);
    if(!n) {
      return false;
    }

    // any number of prepositional phrases attached to the noun
    var preps = [];
    l = n.remain;
    var pp = parsePrepPhrase(l);
    while(pp) {
      preps.push(pp);
      l = pp.remain;
      pp = parsePrepPhrase(l);
    }

    return {
      type: 'noun_phrase',
      noun: n,
      adjectives: adjs,
      article: ar,
      preps: preps,
      // words consumed by the prepositional phrases must not be handed back
      remain: l,
    };
  }
  // Consume a leading article from a word list.
  // list: array of words; list[0] is read without a guard, so the caller
  //       must pass an array.
  // Returns {type: 'article', article, remain} when the first word is an
  // article according to isArticle, false otherwise.
  function parseArticle(list) {
    var first = list[0];
    if(isArticle(first)) {
      var rest = list.slice(1);
      return {
        type: 'article',
        article: first,
        remain: rest,
      };
    }
    return false;
  }
  // Adjective parsing placeholder.
  // TODO: implement. Until then no word is ever recognised as an adjective
  // and the function reports failure for every input.
  function parseAdjective(list) {
    var notAnAdjective = false;
    return notAnAdjective;
  }
  // Parse one noun from the front of a word list.
  //
  // If the first word starts a multi-word noun registered in lex.start_word
  // and ALL of that noun's following words are present, the whole name is
  // consumed and its full name returned. Otherwise the first word alone is
  // taken as the noun.
  //
  // list: array of words.
  // Returns false for a missing/empty list, otherwise
  //   {type: 'noun', noun, remain}.
  function parseNoun(list) {
    if(!list || list.length == 0) return false;

    // true when the words of `list` from index `from` on begin with every
    // word of `words`, in order
    function followedBy(from, words) {
      if(list.length - from < words.length) return false;
      for(var k = 0; k < words.length; k++) {
        if(list[from + k] != words[k]) return false;
      }
      return true;
    }

    var first = list[0];
    // own-property lookup only: an input word such as "constructor" must
    // not resolve to something inherited from Object.prototype
    var candidates = Object.prototype.hasOwnProperty.call(lex.start_word, first)
      ? lex.start_word[first]
      : [];

    var word;
    for(var i = 0; i < candidates.length; i++) {
      var w = candidates[i];
      if(w.pos != 'noun') continue;
      if(followedBy(1, w.follows)) {
        word = w;
        break;
      }
    }

    if(word) {
      return {
        type: 'noun',
        noun: word.full,
        remain: list.slice(1 + word.follows.length),
      };
    }
    return {
      type: 'noun',
      noun: first,
      remain: list.slice(1),
    };
  }
  // Parse a prepositional phrase (preposition + noun phrase) from the front
  // of a word list.
  //
  // list: array of words; a missing or empty list is treated as "no phrase".
  // Returns false when the list does not start with a preposition, otherwise
  //   {type: 'prep_phrase', prep: {type, prep, remain}, obj, remain}.
  // When the preposition is not followed by a noun phrase, a message is
  // logged, `obj` is false and `remain` is everything after the preposition.
  function parsePrepPhrase(list) {
    if(!list || list.length == 0) return false;
    if(!isPreposition(list[0])) return false;

    var l = list.slice(1);
    var np = parseNounPhrase(l);
    var l2;
    if(!np) {
      console.log('parser: broken prepositional phrase')
      l2 = l;
    }
    else l2 = np.remain;

    return {
      type: 'prep_phrase',
      prep: {
        type: 'prep',
        prep: list[0],
        remain: l,
      },
      obj: np,
      remain: l2,
    };
  }
  // Parse a verb phrase: any number of leading adverbs followed by a verb.
  //
  // list: array of words.
  // Returns false when no verb follows the adverbs, otherwise
  //   {type: 'verb_phrase', verb, adverbs, remain}
  // with `remain` taken from the parsed verb.
  function parseVerbPhrase(list) {
    var adverbs = [];
    var rest = list;

    // keep consuming adverbs until parseAdverb reports failure
    var adverb = parseAdverb(rest);
    while(adverb) {
      rest = adverb.remain;
      adverbs.push(adverb);
      adverb = parseAdverb(rest);
    }

    var verb = parseVerb(rest);
    if(verb) {
      return {
        type: 'verb_phrase',
        verb: verb,
        adverbs: adverbs,
        remain: verb.remain,
      };
    }
    return false;
  }
  // Adverb parsing placeholder.
  // TODO: implement. Until then no word is ever recognised as an adverb
  // and the function reports failure for every input.
  function parseAdverb(list) {
    var notAnAdverb = false;
    return notAnAdverb;
  }
  // Take the first word of the list as the verb.
  // list: array of words.
  // Returns false for a missing/empty list, otherwise
  //   {type: 'verb', verb, remain} with `remain` being the other words.
  function parseVerb(list) {
    var hasWords = list && list.length != 0;
    if(!hasWords) return false;

    var head = list[0];
    var tail = list.slice(1);
    return {
      type: 'verb',
      verb: head,
      remain: tail,
    };
  }
  // An indirect object is parsed as an ordinary noun phrase.
  // list: array of words. Returns parseNounPhrase's result unchanged.
  function parseIndirectObject(list) {
    var indirect = parseNounPhrase(list);
    return indirect;
  }
  // A direct object is parsed as an ordinary noun phrase.
  // list: array of words. Returns parseNounPhrase's result unchanged.
  function parseDirectObject(list) {
    var direct = parseNounPhrase(list);
    return direct;
  }
  255. // var stemmed_list = raw_list.map(stemmer);
  // Normalise a value to object form.
  // x: any value.
  // Returns x itself when it is already an object (arrays included);
  // anything else, null included, is wrapped as {what: x}.
  function seqNorm(x) {
    // typeof yields lower-case 'object', and also does so for null
    if(x !== null && typeof x == 'object') return x;
    return {what: x};
  }
  // Look up the action registered for a verb in lex.verbs.
  // word: verb to look up.
  // Returns the action name, or undefined when the verb is unknown or no
  // verb table exists yet.
  function actionFromVerb(word) {
    var verbs = lex.verbs;
    if(!verbs) return undefined;
    // own properties only, so that e.g. "constructor" is not a known verb
    if(!Object.prototype.hasOwnProperty.call(verbs, word)) return undefined;
    return verbs[word];
  }
  // Split text into words on whitespace.
  // Runs of whitespace count as one separator and leading/trailing
  // whitespace is ignored, so the result never contains empty strings.
  // text: string to split.
  // Returns an array of words; empty for empty or whitespace-only text.
  function wordSplit(text) {
    // minimize whitespace first
    var squeezed = text.replace(/\s+/g, ' ').replace(/^ | $/g, '');
    if(squeezed === '') return [];
    return squeezed.split(' ');
  }
  // Collapse every run of whitespace to a single space, then drop
  // whitespace from the end and from the start of the text.
  // text: string. Returns the cleaned-up string.
  function wsMinTrim(text) {
    var collapsed = text.replace(/\s+/g, ' ');
    var withoutTail = collapsed.replace(/\s*$/, '');
    var withoutHead = withoutTail.replace(/^\s*/, '');
    return withoutHead;
  }
  // Remove special characters from a word: everything except ASCII letters,
  // digits and dots is dropped, then one trailing dot and one leading dot
  // are removed (in that order).
  // text: string. Returns the stripped string.
  function stripper(text) {
    var kept = text.replace(/[^0-9a-zA-Z\.]/g, '');
    var noTrailingDot = kept.replace(/\.$/, '');
    var noLeadingDot = noTrailingDot.replace(/^\./, '');
    return noLeadingDot;
  }
  // Crude stemmer: removes one trailing suffix made of an optional "e",
  // an optional apostrophe, an optional "s" and an optional apostrophe,
  // e.g. "horses" -> "hors", "dog's" -> "dog".
  // raw: word. Returns the stemmed word.
  function stemmer(raw) {
    var suffix = /e?'?s?'?$/;
    var stem = raw.replace(suffix, '');
    return stem;
  }
  // Lookup table (word -> true) of the words treated as articles.
  var article_words = objectify([
    'a',
    'an',
    'the',
    'one',
    'some',
    'few',
  ]);
  // Drop the articles from a word list.
  // list: array of words.
  // Returns a new array holding, in order, every word that is not a key of
  // article_words.
  function junkFilter(list) {
    return list.filter(function(w) {
      // compare against true so inherited members never count as articles
      return article_words[w] !== true;
    });
  }
  // Lookup table (word -> true) of the words treated as prepositions.
  var preposition_words = objectify([
    'aboard', 'about', 'above', 'across', 'after', 'against', 'along', 'amid',
    'among', 'anti', 'around', 'as', 'at',
    'before', 'behind', 'below', 'beneath', 'beside', 'besides', 'between',
    'beyond', 'but', 'by',
    'concerning', 'considering',
    'despite', 'down', 'during',
    'except', 'excepting', 'excluding',
    'following', 'for', 'from',
    'in', 'inside', 'into',
    'like',
    'minus',
    'near',
    'of', 'off', 'on', 'onto', 'opposite', 'outside', 'over',
    'past', 'per', 'plus',
    'regarding', 'round',
    'save', 'since',
    'than', 'through', 'to', 'toward', 'towards',
    'under', 'underneath', 'unlike', 'until', 'up', 'upon',
    'versus', 'via',
    'with', 'within', 'without',
  ]);
  // Drop the prepositions from a word list.
  // list: array of words.
  // Returns a new array holding, in order, every word that is not a key of
  // preposition_words.
  function prepFilter(list) {
    return list.filter(function(w) {
      // compare against true so inherited members never count as prepositions
      return preposition_words[w] !== true;
    });
  }
  // Tell whether a word is listed in preposition_words.
  // word: word to test. Returns a boolean.
  function isPreposition(word) {
    var entry = preposition_words[word];
    // table entries are the literal `true`; anything else is not a match
    return entry === true;
  }
  // Tell whether a word is listed in article_words.
  // word: word to test. Returns a boolean.
  function isArticle(word) {
    var entry = article_words[word];
    // table entries are the literal `true`; anything else is not a match
    return entry === true;
  }
  // Very rough plural check: true for any word that ends in "s".
  // word: word to test. Returns a boolean.
  function isPlural(word) {
    var endsInS = /s$/.exec(word) !== null;
    return endsInS;
  }
  // Possessive check: true for words that end in "'s" or "s'".
  // word: word to test. Returns a boolean.
  function isPosessive(word) {
    var hasPossessiveEnding = /('s|s')$/.exec(word) !== null;
    return hasPossessiveEnding;
  }
  // Placeholder for word classification: does nothing and returns undefined.
  function classify(word) {
    return;
  }
  313. // misc helpers
  // Turn an array of keys into a lookup object that maps each key to true.
  // arr: array of keys.
  // Returns the lookup object; repeated keys collapse into one entry.
  function objectify(arr) {
    var lookup = {};
    var idx = 0;
    while(idx < arr.length) {
      lookup[arr[idx]] = true;
      idx += 1;
    }
    return lookup;
  }
  319. // preprocessing
  // Register a verb -> action mapping in the lexicon.
  //
  // input: object whose keys are verbs and whose values are action names.
  //        Both are lower-cased; verbs are also whitespace-trimmed and stemmed.
  // Side effects: fills lex.actions (action -> verbs), lex.verbs
  //               (verb -> action) and lex.pos (verb -> 'verb'), creating
  //               those tables when the lexicon does not have them yet.
  // Returns {actions, verbs, pos} holding the entries produced by this call.
  function processVerbList(input) {
    var out = {
      actions: {},
      verbs: {},
      pos: {},
    };

    // the tables must exist before they are indexed below
    lex.actions = lex.actions || {};
    lex.verbs = lex.verbs || {};
    lex.pos = lex.pos || {};

    for(var verb in input) {
      var action = input[verb].toLowerCase();
      var v = stemmer(wsMinTrim(verb.toLowerCase())); // TODO: handle multi-word phrases

      out.actions[action] = out.actions[action] || [];
      out.actions[action].push(v);
      out.verbs[v] = action;
      out.pos[v] = 'verb';

      lex.actions[action] = lex.actions[action] || [];
      lex.actions[action].push(v);
      lex.verbs[v] = action;
      lex.pos[v] = 'verb';
    }

    return out;
  }
  // Register players in lex.players under their lower-cased name and,
  // when they have one, their lower-cased nick. Both keys map to the
  // player's name as given.
  // input: collection of player records ({name, nick}); entries without a
  //        name are skipped.
  function processPlayerList(input) {
    for(var i in input) {
      var pl = input[i];
      if(!pl || !pl.name) continue; // shim for incomplete definitions
      lex.players[pl.name.toLowerCase()] = pl.name;
      if(pl.nick) {
        lex.players[pl.nick.toLowerCase()] = pl.name;
      }
    }
  }
  // Register monster names in the lexicon.
  //
  // Every named monster is recorded in lex.pos as a noun under its
  // lower-cased name. A multi-word name is also filed in lex.start_word
  // under its first word, so that a noun lookup starting from that word can
  // find the whole name.
  //
  // The words of a name are stripped of special characters and stemmed, so
  // that they can be compared with stemmed input words. Articles and
  // prepositions are left unstemmed because stemming would mangle them
  // ("the" -> "th").
  //
  // input: object mapping monster id -> definition ({name, ...}).
  function processMonsterList(input) {
    for(var id in input) {
      var mon = input[id];
      if(!mon.name) continue; // shim for incomplete definitions

      var full = mon.name.toLowerCase();
      var l = wordSplit(full).map(stripper).map(function(w) {
        return (isArticle(w) || isPreposition(w)) ? w : stemmer(w);
      });

      // handle multi-word nouns
      if(l.length > 1) {
        lex.start_word[l[0]] = lex.start_word[l[0]] || [];
        lex.start_word[l[0]].push({
          follows: l.slice(1),
          type: 'monster',
          pos: 'noun',
          id: id,
          full: full,
        });
      }

      lex.pos[full] = 'noun';
    }
  }
  // Placeholder for weapon preprocessing: does nothing with its input and
  // returns undefined.
  function processWeaponList(input) {
    return;
  }
  // Register item names in the lexicon.
  //
  // Every named item is recorded in lex.pos as a noun under its lower-cased
  // name. A multi-word name is also filed in lex.start_word under its first
  // word, so that a noun lookup starting from that word can find the whole
  // name.
  //
  // The words of a name are stripped of special characters and stemmed, so
  // that they can be compared with stemmed input words. Articles and
  // prepositions are left unstemmed because stemming would mangle them
  // ("the" -> "th").
  //
  // input: object mapping item id -> definition ({name, ...}).
  function processItemList(input) {
    for(var id in input) {
      var item = input[id];
      if(!item.name) continue; // shim for incomplete definitions

      var full = item.name.toLowerCase();
      var l = wordSplit(full).map(stripper).map(function(w) {
        return (isArticle(w) || isPreposition(w)) ? w : stemmer(w);
      });

      // handle multi-word nouns
      if(l.length > 1) {
        lex.start_word[l[0]] = lex.start_word[l[0]] || [];
        lex.start_word[l[0]].push({
          follows: l.slice(1),
          type: 'item',
          pos: 'noun',
          id: id,
          full: full,
        });
      }

      lex.pos[full] = 'noun';
    }
  }
  // Build the lexicon: load the verb table from ./parser_config.js, then
  // feed the game's players, monsters, weapons and items to their
  // respective preprocessing functions, in that order.
  function processGame() {
    var config = require('./parser_config.js');
    var verbTable = config.verbs;

    processVerbList(verbTable);
    processPlayerList(game.players);
    processMonsterList(game.monsters);
    processWeaponList(game.weapons);
    processItemList(game.items);
  }
  393. return {
  394. processGame: processGame,
  395. parse: parse,
  396. parseSentence: parseSentence,
  397. stemmer: stemmer,
  398. stripper: stripper,
  399. processVerbList: processVerbList,
  400. };
  401. }