lonelocust
/
Idiolectalyzer


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126
							import unittest
import sys
import re
sys.path.append("..")
import idiolectalyzer
import collections

def readTestText(testTextFile):
    """Return the full contents of the text file at ``testTextFile``.

    The file is opened in text mode for reading.  The ``with`` statement
    closes the file when the block exits, so no explicit ``close()`` call
    is needed afterwards.
    """
    with open(testTextFile, 'r') as testFile:
        return testFile.read()

class setupStripData(object):
    """Fixture holding marker counts for a file after ``stripData`` is applied.

    The file is read with ``readTestText`` and passed through
    ``idiolectalyzer.stripData``; the following counts are then taken from
    the returned text and stored on the instance:

    lineBreaks   -- number of newline characters
    doubleSpaces -- number of matches of two spaces followed by a non-space
    commas       -- number of occurrences of a backslash followed by a period
                    (see the review note in ``__init__``)
    """

    def __init__(self, testTextFile):
        strippedText = idiolectalyzer.stripData(readTestText(testTextFile))
        self.lineBreaks = strippedText.count('\n')
        self.doubleSpaces = len(re.findall('  [^ ]', strippedText))
        # The searched text is the two characters backslash + period, now
        # written with a valid escape ('\\.') instead of the invalid '\.'
        # escape sequence, which newer Python versions warn about.
        # NOTE(review): str.count() matches a literal substring, not a regex,
        # so this counts neither commas nor periods and will almost always be
        # 0.  If the intent is to count commas use ',' (or '.' for periods);
        # confirm against what stripData is expected to remove before
        # changing the value.
        self.commas = strippedText.count('\\.')

class testTextAnalysis(unittest.TestCase):
    """Tests for the idiolectalyzer analysis functions.

    Input is read from files under the relative path ``mockdata/``, so the
    tests have to be run from the directory that contains that folder.
    """

    def _assertFullyStripped(self, testTextFile):
        """Assert every count recorded by setupStripData is zero for the file."""
        testData = setupStripData(testTextFile)
        self.assertEqual(testData.lineBreaks, 0)
        self.assertEqual(testData.doubleSpaces, 0)
        self.assertEqual(testData.commas, 0)

    def testStripData(self):
        """Stripped text from both mock files has zero for every counted marker."""
        # Two separate calls (rather than a loop) so a failing traceback
        # points at the file that triggered it.
        self._assertFullyStripped("mockdata/withlinebreaks.txt")
        self._assertFullyStripped("mockdata/lotsofpunctuation.txt")

    @unittest.skip("command line function; no automated check written")
    def testGetTextSample(self):
        """Placeholder: reported as skipped instead of silently passing."""
        pass

    def testStructureMarkers(self):
        """checkStructureMarkers returns the expected value for each marker."""
        testText = readTestText("mockdata/lotsofpunctuation.txt")

        lowercase = idiolectalyzer.checkStructureMarkers(testText, 'lowercase')
        self.assertEqual(lowercase, 96)

        doublespace = idiolectalyzer.checkStructureMarkers(testText, 'doublespace')
        self.assertAlmostEqual(doublespace, 0.58, places=2)

        unusualcount = idiolectalyzer.checkStructureMarkers(testText, 'unusualspacing')
        self.assertAlmostEqual(unusualcount, 0.39, places=2)

        linebreak = idiolectalyzer.checkStructureMarkers(testText, 'linebreak')
        self.assertAlmostEqual(linebreak, 0.39, places=2)

    def testCountFunctionWords(self):
        """countFunctionWords returns 114 for the 251-word sample."""
        expectedCount = 114
        testText = readTestText("mockdata/251words.txt")
        countedCount = idiolectalyzer.countFunctionWords(testText)
        self.assertEqual(countedCount, expectedCount)

    def testCalculateLexicalDensity(self):
        """calculateLexicalDensity returns 54 for the 251-word sample."""
        expectedDensity = 54
        testText = readTestText("mockdata/251words.txt")
        density = idiolectalyzer.calculateLexicalDensity(testText)
        self.assertEqual(expectedDensity, density)

    def testFindRepeatWords(self):
        """findRepeatWords: full result, maxWords-limited result, invalid options."""
        testText = readTestText("mockdata/251words.txt")

        # Called with only the text and 3, a dict of word -> count is expected.
        expectedAll = {
            'his': 7, 'every': 5, 'like': 4, 'thou': 3, 'however,': 3,
            'was': 6, 'he': 6, 'and': 13, 'which': 3, 'a': 6, 'the': 21,
        }
        counts = idiolectalyzer.findRepeatWords(testText, 3)
        self.assertEqual(expectedAll, counts)

        # With maxWords=5 a list of five (word, count) pairs is expected.
        expectedTop = [('the', 21), ('and', 13), ('his', 7), ('was', 6), ('he', 6)]
        counts = idiolectalyzer.findRepeatWords(testText, 3, maxWords=5)
        self.assertEqual(expectedTop, counts)

        # Invalid option values are expected to yield False.
        counts = idiolectalyzer.findRepeatWords(testText, 3, maxWords='invalid')
        self.assertEqual(False, counts)
        counts = idiolectalyzer.findRepeatWords(testText, 3, context='invalid')
        self.assertEqual(False, counts)

    def testHowCommonIs(self):
        """howCommonIs: default list, named list, invalid list, unknown word."""
        google1965Expectation = 8131
        allgoogleExpectation = 6321

        pieRank = idiolectalyzer.howCommonIs("pie")
        self.assertEqual(pieRank, allgoogleExpectation)

        pieRank = idiolectalyzer.howCommonIs("pie", context='google_1965')
        self.assertEqual(pieRank, google1965Expectation)

        pieRank = idiolectalyzer.howCommonIs("pie", context='invalid_list')
        self.assertFalse(pieRank)

        pieRank = idiolectalyzer.howCommonIs("pyropedonecrobestiality")
        self.assertEqual(pieRank, "unique")

    def testCommonMisspellings(self):
        """findCommonMisspellings in 'count' mode: counts dict, or None if clean."""
        testText = readTestText("mockdata/withspellingerrors.txt")
        spellingErrorsCount = idiolectalyzer.findCommonMisspellings(testText, 'count')
        expectedCounts = {'ecstacy': 1, 'becuase': 4, 'heigth': 7}
        self.assertEqual(expectedCounts, spellingErrorsCount)

        # The 251-word sample is expected to produce None.
        testText = readTestText("mockdata/251words.txt")
        spellingErrorsCount = idiolectalyzer.findCommonMisspellings(testText, 'count')
        self.assertIsNone(spellingErrorsCount)

    def testFrequencyOfRepeats(self):
        """frequencyOfRepeats returns the expected list for a fixed counts list."""
        counts = [('the', 21), ('and', 13), ('his', 7), ('was', 6), ('he', 6), ('alsidkfjads', 7)]
        expectedFrequency = [3, 67, 'unique', 1, 30, 53]
        frequency = idiolectalyzer.frequencyOfRepeats(counts)
        self.assertEqual(expectedFrequency, frequency)

    def testTextDataObjectCreate(self):
        """A textData object can be created and filled from raw sample text."""
        testText = readTestText("mockdata/lotsofpunctuation.txt")
        # The original also computed stripData(testText) into a local that
        # was never used; fill() receives the raw text, as before.
        testDataObject = idiolectalyzer.textData()
        testDataObject.fill(testText)
        self.assertIsInstance(testDataObject, idiolectalyzer.textData)
                  
# Run the whole suite when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()