# test_idiolectalyzer.py -- unit tests for the idiolectalyzer module.

import collections
import re
import sys
import unittest

# Add the parent directory (relative to the current working directory) to the
# import path before importing idiolectalyzer; presumably the module under
# test lives there.
sys.path.append("..")
import idiolectalyzer
  7. def readTestText(testTextFile):
  8. with open(testTextFile,'r') as testFile:
  9. testText=testFile.read()
  10. testFile.close()
  11. return testText
  12. class setupStripData(object):
  13. def __init__(self, testTextFile):
  14. testText = readTestText(testTextFile)
  15. testText = idiolectalyzer.stripData(testText)
  16. self.lineBreaks = testText.count('\n')
  17. self.doubleSpaces = len(re.findall(' [^ ]', testText))
  18. self.commas = testText.count('\.')
  19. class testTextAnalysis(unittest.TestCase):
  20. def testStripData(self):
  21. testData = setupStripData("mockdata/withlinebreaks.txt")
  22. lineBreaks = testData.lineBreaks
  23. doubleSpaces = testData.doubleSpaces
  24. commas = testData.commas
  25. self.assertEqual(lineBreaks,0)
  26. self.assertEqual(doubleSpaces,0)
  27. self.assertEqual(commas,0)
  28. testData = setupStripData("mockdata/lotsofpunctuation.txt")
  29. lineBreaks = testData.lineBreaks
  30. doubleSpaces = testData.doubleSpaces
  31. commas = testData.commas
  32. self.assertEqual(lineBreaks,0)
  33. self.assertEqual(doubleSpaces,0)
  34. self.assertEqual(commas,0)
  35. def testGetTextSample(self):
  36. #this is a command line function
  37. pass
  38. def testStructureMarkers(self):
  39. testTextFile="mockdata/lotsofpunctuation.txt"
  40. testText = readTestText(testTextFile)
  41. lowercase = idiolectalyzer.checkStructureMarkers(testText,'lowercase')
  42. self.assertEqual(lowercase,96)
  43. doublespace = idiolectalyzer.checkStructureMarkers(testText,'doublespace')
  44. self.assertAlmostEqual(doublespace,.58,places=2)
  45. unusualcount = idiolectalyzer.checkStructureMarkers(testText,'unusualspacing')
  46. self.assertAlmostEqual(unusualcount,.39,places=2)
  47. linebreak = idiolectalyzer.checkStructureMarkers(testText,'linebreak')
  48. self.assertAlmostEqual(linebreak,.39,places=2)
  49. def testCountFunctionWords(self):
  50. testTextFile="mockdata/251words.txt"
  51. expectedCount = 114
  52. testText = readTestText(testTextFile)
  53. countedCount=idiolectalyzer.countFunctionWords(testText)
  54. self.assertEqual(countedCount,expectedCount)
  55. def testCalculateLexicalDensity(self):
  56. testTextFile="mockdata/251words.txt"
  57. testText = readTestText(testTextFile)
  58. expectedDensity = 54
  59. density = idiolectalyzer.calculateLexicalDensity(testText)
  60. self.assertEqual(expectedDensity,density)
  61. return
  62. def testFindRepeatWords(self):
  63. testTextFile="mockdata/251words.txt"
  64. testText = readTestText(testTextFile)
  65. expectedCounts = {'his': 7, 'every': 5, 'like': 4, 'thou': 3, 'however,': 3, 'was': 6, 'he': 6, 'and': 13, 'which': 3, 'a': 6, 'the': 21}
  66. counts = idiolectalyzer.findRepeatWords(testText,3)
  67. self.assertEqual(expectedCounts, counts)
  68. expectedCounts = [('the', 21), ('and', 13), ('his', 7), ('was', 6), ('he', 6)]
  69. counts = idiolectalyzer.findRepeatWords(testText,3,maxWords=5)
  70. self.assertEqual(expectedCounts, counts)
  71. counts = idiolectalyzer.findRepeatWords(testText,3,maxWords='invalid')
  72. self.assertEqual(False,counts)
  73. counts = idiolectalyzer.findRepeatWords(testText,3,context='invalid')
  74. self.assertEqual(False,counts)
  75. def testHowCommonIs(self):
  76. google1965Expectation = 8131
  77. allgoogleExpectation = 6321
  78. pieRank = idiolectalyzer.howCommonIs( "pie" )
  79. self.assertEqual(pieRank,allgoogleExpectation)
  80. pieRank = idiolectalyzer.howCommonIs( "pie",context='google_1965')
  81. self.assertEqual(pieRank,google1965Expectation)
  82. pieRank = idiolectalyzer.howCommonIs( "pie",context='invalid_list')
  83. self.assertFalse(pieRank)
  84. pieRank = idiolectalyzer.howCommonIs( "pyropedonecrobestiality")
  85. self.assertEqual(pieRank,"unique")
  86. def testCommonMisspellings(self):
  87. testTextFile="mockdata/withspellingerrors.txt"
  88. testText = readTestText(testTextFile)
  89. spellingErrorsCount=idiolectalyzer.findCommonMisspellings(testText,'count')
  90. expectedCounts = {'ecstacy': 1, 'becuase': 4, 'heigth': 7}
  91. self.assertEqual(expectedCounts, spellingErrorsCount)
  92. testTextFile="mockdata/251words.txt"
  93. testText = readTestText(testTextFile)
  94. spellingErrorsCount=idiolectalyzer.findCommonMisspellings(testText,'count')
  95. expectedResult = None
  96. self.assertEqual(spellingErrorsCount,expectedResult)
  97. def testFrequencyOfRepeats(self):
  98. counts = [('the', 21), ('and', 13), ('his', 7), ('was', 6), ('he', 6), ('alsidkfjads', 7)]
  99. expectedFrequency = [3, 67, 'unique', 1, 30, 53]
  100. frequency = idiolectalyzer.frequencyOfRepeats(counts)
  101. self.assertEqual(expectedFrequency,frequency)
  102. def testTextDataObjectCreate(self):
  103. testTextFile="mockdata/lotsofpunctuation.txt"
  104. testText = readTestText(testTextFile)
  105. strippedText = idiolectalyzer.stripData(testText)
  106. testDataObject = idiolectalyzer.textData()
  107. testDataObject.fill(testText)
  108. self.assertIsInstance(testDataObject, idiolectalyzer.textData)
  109. if __name__ == '__main__':
  110. unittest.main()