123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126 |
- import unittest
- import sys
- import re
- sys.path.append("..")
- import idiolectalyzer
- import collections
def readTestText(testTextFile):
    """Return the entire contents of the file at *testTextFile* as a string.

    The file is opened in text read mode. The ``with`` statement closes the
    handle on exit, so no explicit ``close()`` call is needed.
    """
    with open(testTextFile, 'r') as testFile:
        return testFile.read()
class setupStripData(object):
    """Strip a fixture file with ``idiolectalyzer.stripData`` and count leftovers.

    Attributes set by the constructor:
        lineBreaks: number of newline characters in the stripped text.
        doubleSpaces: number of matches of two consecutive spaces followed
            by a non-space character in the stripped text.
        commas: number of literal backslash-period pairs in the stripped
            text (see the review note in ``__init__``).
    """

    def __init__(self, testTextFile):
        """Read *testTextFile*, pass it through ``stripData`` and record counts."""
        strippedText = idiolectalyzer.stripData(readTestText(testTextFile))
        self.lineBreaks = strippedText.count('\n')
        # The explicit {2} quantifier makes the two-space requirement
        # unambiguous; a single-space pattern would match every ordinary
        # word boundary and could never count to zero on real text.
        self.doubleSpaces = len(re.findall(r' {2}[^ ]', strippedText))
        # Raw string avoids the invalid "\." escape warning while keeping the
        # same runtime value (backslash followed by a period).
        # NOTE(review): str.count() matches literally, not as a regex, so this
        # counts backslash-period pairs rather than commas or periods --
        # confirm which character stripData is expected to remove.
        self.commas = strippedText.count(r'\.')
class testTextAnalysis(unittest.TestCase):
    """Tests for the text-analysis functions exposed by ``idiolectalyzer``.

    Fixture files are read from the ``mockdata/`` directory using paths
    relative to the current working directory.
    """

    def testStripData(self):
        """stripData output has no line breaks, double spaces or counted marks."""
        fixtureFiles = (
            "mockdata/withlinebreaks.txt",
            "mockdata/lotsofpunctuation.txt",
        )
        for fixtureFile in fixtureFiles:
            testData = setupStripData(fixtureFile)
            # The fixture path is passed as the failure message so a failing
            # assertion identifies which file triggered it.
            self.assertEqual(testData.lineBreaks, 0, fixtureFile)
            self.assertEqual(testData.doubleSpaces, 0, fixtureFile)
            self.assertEqual(testData.commas, 0, fixtureFile)

    def testGetTextSample(self):
        """Placeholder: no assertions are made for getTextSample."""
        # this is a command line function
        pass

    def testStructureMarkers(self):
        """checkStructureMarkers returns the expected value for each marker."""
        testText = readTestText("mockdata/lotsofpunctuation.txt")

        lowercase = idiolectalyzer.checkStructureMarkers(testText, 'lowercase')
        self.assertEqual(lowercase, 96)

        doublespace = idiolectalyzer.checkStructureMarkers(testText, 'doublespace')
        self.assertAlmostEqual(doublespace, .58, places=2)

        unusualcount = idiolectalyzer.checkStructureMarkers(testText, 'unusualspacing')
        self.assertAlmostEqual(unusualcount, .39, places=2)

        linebreak = idiolectalyzer.checkStructureMarkers(testText, 'linebreak')
        self.assertAlmostEqual(linebreak, .39, places=2)

    def testCountFunctionWords(self):
        """countFunctionWords returns 114 for the 251-word fixture."""
        expectedCount = 114
        testText = readTestText("mockdata/251words.txt")
        countedCount = idiolectalyzer.countFunctionWords(testText)
        self.assertEqual(countedCount, expectedCount)

    def testCalculateLexicalDensity(self):
        """calculateLexicalDensity returns 54 for the 251-word fixture."""
        testText = readTestText("mockdata/251words.txt")
        expectedDensity = 54
        density = idiolectalyzer.calculateLexicalDensity(testText)
        self.assertEqual(expectedDensity, density)

    def testFindRepeatWords(self):
        """findRepeatWords: full result, maxWords-limited result, bad arguments."""
        testText = readTestText("mockdata/251words.txt")

        # Without maxWords the expected result is a word -> count mapping.
        expectedCounts = {'his': 7, 'every': 5, 'like': 4, 'thou': 3, 'however,': 3, 'was': 6, 'he': 6, 'and': 13, 'which': 3, 'a': 6, 'the': 21}
        counts = idiolectalyzer.findRepeatWords(testText, 3)
        self.assertEqual(expectedCounts, counts)

        # With maxWords the expected result is a list of (word, count) pairs.
        expectedCounts = [('the', 21), ('and', 13), ('his', 7), ('was', 6), ('he', 6)]
        counts = idiolectalyzer.findRepeatWords(testText, 3, maxWords=5)
        self.assertEqual(expectedCounts, counts)

        # Invalid maxWords / context values are expected to yield False.
        counts = idiolectalyzer.findRepeatWords(testText, 3, maxWords='invalid')
        self.assertEqual(False, counts)
        counts = idiolectalyzer.findRepeatWords(testText, 3, context='invalid')
        self.assertEqual(False, counts)

    def testHowCommonIs(self):
        """howCommonIs: default list, named list, invalid list, unknown word."""
        google1965Expectation = 8131
        allgoogleExpectation = 6321

        pieRank = idiolectalyzer.howCommonIs("pie")
        self.assertEqual(pieRank, allgoogleExpectation)

        pieRank = idiolectalyzer.howCommonIs("pie", context='google_1965')
        self.assertEqual(pieRank, google1965Expectation)

        pieRank = idiolectalyzer.howCommonIs("pie", context='invalid_list')
        self.assertFalse(pieRank)

        pieRank = idiolectalyzer.howCommonIs("pyropedonecrobestiality")
        self.assertEqual(pieRank, "unique")

    def testCommonMisspellings(self):
        """findCommonMisspellings in 'count' mode: counts dict, or None if clean."""
        testText = readTestText("mockdata/withspellingerrors.txt")
        spellingErrorsCount = idiolectalyzer.findCommonMisspellings(testText, 'count')
        expectedCounts = {'ecstacy': 1, 'becuase': 4, 'heigth': 7}
        self.assertEqual(expectedCounts, spellingErrorsCount)

        # For the 251-word fixture the expected result is None.
        testText = readTestText("mockdata/251words.txt")
        spellingErrorsCount = idiolectalyzer.findCommonMisspellings(testText, 'count')
        self.assertIsNone(spellingErrorsCount)

    def testFrequencyOfRepeats(self):
        """frequencyOfRepeats returns the expected list for a fixed counts list."""
        counts = [('the', 21), ('and', 13), ('his', 7), ('was', 6), ('he', 6), ('alsidkfjads', 7)]
        expectedFrequency = [3, 67, 'unique', 1, 30, 53]
        frequency = idiolectalyzer.frequencyOfRepeats(counts)
        self.assertEqual(expectedFrequency, frequency)

    def testTextDataObjectCreate(self):
        """A textData object can be created and filled from raw fixture text."""
        testText = readTestText("mockdata/lotsofpunctuation.txt")
        testDataObject = idiolectalyzer.textData()
        testDataObject.fill(testText)
        self.assertIsInstance(testDataObject, idiolectalyzer.textData)
-
# Run the test suite only when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
|