#! /usr/bin/env python
import re
import string

"""plural is a set of data and functions for pluralizing English words, based on Damian Conway's
\"An Algorithmic Approach to English Pluralization\" (http://www.csse.monash.edu.au/~damian) and his Inflect.pm
Perl module.  At the moment, only the pluralization of nouns is implemented.
"""

# Nouns ending in -s with an irregular plural.  Each value lists the accepted
# plurals: pluralize() takes the first entry for the ordinary plural and the
# last entry when classical plurals are requested (a one-element tuple serves
# for both).
irregularS = {
    "ephemeris": ("ephemerides",),
    "iris": ("irises", "irides"),
    "clitoris": ("clitorises", "clitorides"),
    "corpus": ("corpuses", "corpora"),
    "opus": ("opuses", "opera"),
    "genus": ("genera",),
    "mythos": ("mythoi",),
    "penis": ("penises", "penes"),
    "testis": ("testes",),
}

# The singular forms as a real list: dict.keys() is only a view on Python 3,
# so it is copied explicitly to keep the name truthful on every version.
irregularSList = list(irregularS.keys())

# Nouns with an irregular plural that no suffix rule covers.  Each value
# lists the accepted plurals: pluralize() takes the first entry for the
# ordinary plural and the last entry when classical plurals are requested
# (a one-element tuple serves for both).
irregular = {
    "child": ("children",),
    "brother": ("brothers", "brethren"),
    "loaf": ("loaves",),
    "hoof": ("hoofs", "hooves"),
    "beef": ("beefs", "beeves"),
    "money": ("monies",),
    "mongoose": ("mongooses",),
    "ox": ("oxen",),
    "cow": ("cows", "kine"),
    "soliloquy": ("soliloquies",),
    "graffito": ("graffiti",),
    "prima donna": ("prima donnas", "prime donne"),
    "octopus": ("octopuses", "octopodes"),
    "genie": ("genies", "genii"),
    "ganglion": ("ganglions", "ganglia"),
    "trilby": ("trilbys",),
    "turf": ("turfs", "turves"),
}

# The singular forms as a real list: dict.keys() is only a view on Python 3,
# so it is copied explicitly to keep the name truthful on every version.
irregularList = list(irregular.keys())

# a -> ata (classical plural only; the ordinary plural just adds -s)
classicalAtoATA = ( "anathema", "bema", "carcinoma", "charisma", "diploma",
               "dogma", "drama", "edema", "enema", "enigma", "lemma",
               "lymphoma", "magma", "melisma", "miasma", "oedema",
               "sarcoma", "schema", "soma", "stigma", "stoma", "trauma",
               "gumma", "pragma",
               )


# a -> ae, always
unconditionalAtoAE = ( "alumna", "alga", "vertebra",)
# a -> ae (classical plural only; the ordinary plural just adds -s)
# These tables are searched with a plain `in` test, so every entry has to be
# a literal word: the former regex-style entry ".*umbra" could never match
# and is spelled out as the words it stood for.
classicalAtoAE = ( "amoeba", "antenna", "formula", "hyperbola",
                "medusa", "nebula", "parabola", "abscissa",
                "hydra", "nova", "lacuna", "aurora", "umbra", "penumbra",
                )
# en -> ina (classical plural only; the ordinary plural just adds -s)
classicalENtoINA = ( "stamen", "foramen", "lumen",)
# um -> a, always
unconditionalUMtoA = ( "bacterium", "agendum", "desideratum", "erratum",
                       "stratum", "datum", "ovum", "extremum", "candelabrum",)
# um -> a (classical plural only; the ordinary plural just adds -s)
# "encomium" and "millennium" were previously misspelled ("enconium",
# "millenium"), so the correctly spelled words were never recognised.
classicalUMtoA = ( "maximum", "minimum", "momentum", "optimum",
                   "quantum", "cranium", "curriculum", "dictum",
                   "phylum", "aquarium", "compendium", "emporium",
                   "encomium", "gymnasium", "honorarium", "interregnum",
                   "lustrum", "memorandum", "millennium", "rostrum",
                   "spectrum", "speculum", "stadium", "trapezium",
                   "ultimatum", "medium", "vacuum", "velum",
                   "consortium",)

# Naming convention for the tables below: an "unconditional" table holds
# words that fixAssimilatedSuffix always gives the foreign plural, while a
# "classical" table holds words that fixIncompletelyAssimilated gives the
# foreign plural only when classical plurals are requested.

# us -> i
unconditionalUStoI = (
    "alumnus", "alveolus", "bacillus", "bronchus", "locus",
    "nucleus", "stimulus", "meniscus",
)
classicalUStoI = (
    "focus", "radius", "genius", "incubus", "succubus", "nimbus",
    "fungus", "nucleolus", "stylus", "torus", "umbilicus", "uterus",
)

# us -> us: the classical plural equals the singular, otherwise -es is added
classicalUStoUS = (
    "status", "apparatus", "prospectus", "sinus", "hiatus", "impetus",
    "plexus",
)

# on -> a
unconditionalONtoA = (
    "criterion", "perihelion", "aphelion", "phenomenon", "prolegomenon",
    "noumenon", "organon", "asyndeton", "hyperbaton",
)
# NOTE(review): not consulted by any function visible in this file.
classicalONtoA = ("oxymoron",)

# o -> i
classicalOtoI = (
    "solo", "soprano", "basso", "alto", "canto", "contralto", "tempo",
)
# o -> os
# NOTE(review): not consulted by any function visible in this file.
unconditionalOtoOS = (
    "albino", "archipelago", "armadillo", "commando", "crescendo",
    "fiasco", "ditto", "dynamo", "embryo", "ghetto",
    "guano", "inferno", "jumbo", "lumbago", "magneto",
    "manifesto", "medico", "octavo", "photo", "pro",
    "quarto", "lingo", "generalissimo", "stylo", "rhino",
)

# ex -> ices
unconditionalEXtoICES = ("codex", "murex", "silex")
# NOTE(review): not consulted by any function visible in this file.
classicalEXtoICES = (
    "vortex", "vertex", "cortex", "latex", "pontifex", "apex", "index",
    "simplex",
)

# NOTE(review): neither of the next two tables is consulted by any function
# visible in this file.
arabic_toI = ("afrit", "afreet", "efreet")
hebrew_toIM = ("goy", "seraph", "cherub")

# Words ending in -man that take -mans
unconditionalMANtoMANS = (
    "human",
    "alabaman", "bahaman", "burman", "german", "hiroshiman",
    "liman", "nakayaman", "oklahoman", "panaman", "selman",
    "sonaman", "tacoman", "yakiman", "yokohaman", "yuman",
)

# Nouns whose plural is spelled exactly like the singular.  isUninflected()
# checks a word against this tuple first and against uninflectedRE second.
uninflected = (
    # fish
    "fish", "tuna", "salmon", "mackerel", "trout", "bream", "carp", "cod",
    "flounder", "whiting",
    # other animals
    "deer", "sheep", "wildebeest", "swine", "eland", "bison", "elk",
    # nationalities in -ese
    "portuguese", "japanese", "chinese", "vietnamese", "burmese", "lebanese",
    "siamese", "senegalese", "bhutanese", "sinhalese",
    "pox",
    "graffiti", "djinn",
    # nouns that already end in -s
    "breeches", "britches", "clippers", "gallows", "hijinks", "headquarters",
    "pliers", "scissors", "testes", "herpes", "pincers", "shears",
    "proceedings",
    "cantus", "coitus", "nexus",
    "contretemps", "corps", "debris",
    "measles", "mumps",
    "diabetes", "jackanapes", "series", "species", "rabies",
    "chassis", "innings", "news", "mews",
)

# Whole-word patterns for uninflected families (e.g. anything ending in
# -fish, -deer or -sheep); \Z anchors each alternative at the end of the word.
uninflectedRE = re.compile(
    r'(.*fish|.*deer|.*sheep|.*pox|.*ois|.*measles|sea[- ]bass|.*itis)\Z'
)

# Singular nouns that end in -s.
# NOTE(review): not consulted by any function visible in this file.  The
# first and last entries (".*ss", ".*us") are regex-style patterns, so a
# plain `in` test against this tuple would never match them -- confirm the
# intended use before relying on it.
singularS = (
    ".*ss",
    "acropolis", "aegis", "alias", "arthritis", "asbestos",
    "atlas", "bathos", "bias", "bronchitis", "bursitis",
    "caddis", "cannabis", "canvas", "chaos", "cosmos",
    "dais", "digitalis", "encephalitis", "epidermis", "ethos",
    "gas", "glottis", "hepatitis", "hubris", "ibis",
    "lens", "mantis", "marquis", "metropolis", "neuritis",
    "pathos", "pelvis", "polis", "rhinoceros", "sassafras",
    "tonsillitis", "trellis",
    ".*us",
)
    
# Quantity words meaning "none".
# NOTE(review): not consulted by any function visible in this file.
countZero = (
    0,
    "no", "zero", "nil",
)
# Quantities and determiners that call for the singular: pluralize() returns
# its word unchanged when `number` is one of these.
countOne = (
    1, "1",
    "a", "an", "one", "each", "every", "this", "that",
)

# Lower-case singular pronoun -> its plural.  pluralize() looks the
# lower-cased word up here before trying any noun rule.
pronouns = {
    # first person
    "i": "we",
    "myself": "ourselves",
    # second person
    "you": "you",
    "yourself": "yourselves",
    # third person
    "she": "they",
    "herself": "themselves",
    "he": "they",
    "himself": "themselves",
    "it": "they",
    "itself": "themselves",
    "they": "they",
    "themself": "themselves",
}

# Families of irregular suffixes.  irregSuffixRE recognises a word ending in
# one of them; irregSuffixes maps the literal ending to strip onto the plural
# ending that replaces it.  Every alternative in the pattern needs an entry
# in the mapping, otherwise replIrregSuffix hands the word back unchanged.
irregSuffixRE = re.compile(r'(.*goose|.*tooth|.*foot|.*[ml]ouse|.*man|.*zoon|.*[csx]is)\Z')
irregSuffixes = { 'goose' : 'geese',
                  'tooth' : 'teeth',
                  'foot' : 'feet',   # was missing although the pattern matches '.*foot'
                  'ouse' : 'ice',
                  'man' : 'men',
                  'zoon': 'zoa',
                  'is' : 'es',
                  }

# Endings with two accepted plurals: (classical ending, anglicized ending).
# fixIncompletelyAssimilated uses index 0 when classical plurals are
# requested and index 1 otherwise.
# NOTE(review): the usual classical plural of -trix is "-trices"; 'trice' is
# left untouched because pluralize's doctest expects 'executrice' -- confirm.
incompletelyAssimilated = { 'trix' : ('trice', 'trixes'),
                            'eau' : ('eaux', 'eaus'),
                            'ieu' : ('ieux', 'ieus'),
                            'nx' : ('nges', 'nxes'),
                            'ch' : ('ches', 'ches'),
                            'sh' : ('shes', 'shes'),
                            }

def isUninflected(word):
    """Return 1 if word is listed in uninflected or matches uninflectedRE, else 0."""
    if word in uninflected:
        return 1
    if uninflectedRE.match(word):
        return 1
    return 0

def isIrregular(word):
    """Return True if word is a key of the irregular table."""
    # `in` replaces dict.has_key(), which no longer exists on Python 3.
    return word in irregular

def hasIrregularSuffix(word):
    """Return the irregSuffixRE match object for word, or None if it does not match."""
    return irregSuffixRE.match(word)

def replIrregSuffix(matchOb):
    """Replacement callback for irregSuffixRE.sub(): pluralize the matched word.

    matchOb is a match object whose .string is the word being pluralized.
    Returns that word with its irregular ending swapped for the plural
    ending from irregSuffixes, or the word unchanged when no ending applies.
    """
    word = matchOb.string
    for suffix, pluralSuffix in irregSuffixes.items():
        # Only an ending at the very end of the word counts.  Searching the
        # whole word (as rfind did) let an earlier fragment win: 'houseman'
        # contains 'ouse' and could come back as 'hice'.
        if word.endswith(suffix):
            return word[:len(word) - len(suffix)] + pluralSuffix
    return word

def hasAssimilatedClassicalSuffix(word):
    """Return 1 if word is listed in any of the unconditional* tables, else 0."""
    alwaysClassical = (
        unconditionalAtoAE,
        unconditionalUMtoA,
        unconditionalONtoA,
        unconditionalUStoI,
        unconditionalEXtoICES,
        unconditionalMANtoMANS,
    )
    for table in alwaysClassical:
        if word in table:
            return 1
    return 0

def fixAssimilatedSuffix(word):
    """Return the plural of a word listed in one of the unconditional* tables.

    The first table containing word decides the plural; a word found in none
    of them is returned unchanged.
    """
    # (table, number of trailing characters to drop, ending to append)
    rules = (
        (unconditionalAtoAE, 0, 'e'),
        (unconditionalUMtoA, 2, 'a'),
        (unconditionalONtoA, 2, 'a'),
        (unconditionalUStoI, 2, 'i'),
        (unconditionalEXtoICES, 2, 'ices'),
        (unconditionalMANtoMANS, 3, 'mans'),
    )
    for table, dropCount, ending in rules:
        if word in table:
            stem = word[:len(word) - dropCount]
            return stem + ending
    return word

def suffix(word, suff):
    """Return 1 if the last len(suff) characters of word equal suff, else 0."""
    tail = word[-len(suff):]
    if tail != suff:
        return 0
    return 1

def inflection(word, suffix, pluralSuffix):
    """Return word with its trailing suffix replaced by pluralSuffix.

    word is returned unchanged when it does not end with suffix.  Only an
    occurrence at the very end of the word is replaced; the earlier
    rfind-based version also matched suffix in the middle of the word and
    silently dropped everything after it.
    """
    if word.endswith(suffix):
        return word[:len(word) - len(suffix)] + pluralSuffix
    return word
        
def incompletelyAssimilatedClassical(word):
    """Return 1 if word has both a classical and an anglicized plural, else 0.

    That is the case when word ends in one of the incompletelyAssimilated
    endings or is listed in one of the classical* tables checked below.
    """
    for ending in incompletelyAssimilated:
        if suffix(word, ending):
            return 1
    classicalTables = (
        classicalAtoATA,
        classicalAtoAE,
        classicalENtoINA,
        classicalUMtoA,
        classicalUStoI,
        classicalUStoUS,
        classicalOtoI,
    )
    for table in classicalTables:
        if word in table:
            return 1
    return 0

def fixIncompletelyAssimilated(word, classical):
    """Return the plural of a word that has a classical and an anglicized form.

    word      -- lower-case singular noun
    classical -- true to get the classical plural, false for the anglicized one

    The incompletelyAssimilated endings are tried first, then the classical*
    tables in the order listed below.  A word covered by none of them is
    returned unchanged, matching fixAssimilatedSuffix (previously the
    function fell off the end and returned None).
    """
    for suff, pluralSuff in incompletelyAssimilated.items():
        if suffix(word, suff):
            if classical:
                return inflection(word, suff, pluralSuff[0])
            return inflection(word, suff, pluralSuff[1])

    # (table, singular ending, classical ending, ending added for the
    # anglicized plural)
    rules = (
        (classicalAtoATA, 'a', 'ata', 's'),
        (classicalAtoAE, 'a', 'ae', 's'),
        (classicalENtoINA, 'en', 'ina', 's'),
        (classicalUMtoA, 'um', 'a', 's'),
        (classicalUStoI, 'us', 'i', 'es'),
        # classical plural is identical to the singular
        (classicalUStoUS, 'us', 'us', 'es'),
        (classicalOtoI, 'o', 'i', 's'),
    )
    for table, singularEnding, classicalEnding, anglicizedAddition in rules:
        if word in table:
            if classical:
                return inflection(word, singularEnding, classicalEnding)
            return word + anglicizedAddition
    return word

def pluralize(word, number=2, classical=0):
    """Return the plural of the English noun or pronoun `word`.

    word      -- the singular form
    number    -- the quantity being described; if it is one of countOne
                 (1, 'one', 'a', 'each', ...) word is returned untouched
    classical -- true to prefer classical plurals ('brethren', 'foci') over
                 the anglicized ones ('brothers', 'focuses')

    The word is lower-cased before lookup.  If that changed it, the result is
    capitalized (first letter upper-case, the rest lower-case), except for
    the pronoun 'I'.

    Rules are tried in this order: uninflected words, pronouns, irregular
    nouns, unconditional classical suffixes, irregular suffix families,
    irregular nouns in -s, nouns with both a classical and an anglicized
    plural, sibilant endings (-ch, -sh, -s, -x, -z take -es), consonant + y
    (-ies), and finally plain -s.

    >>> pluralize('cat', 1)
    'cat'
    >>> pluralize('cat', 'one')
    'cat'
    >>> pluralize('cat')
    'cats'
    >>> pluralize('Japanese', 2)
    'Japanese'
    >>> pluralize('Japanese')
    'Japanese'
    >>> pluralize('fish')
    'fish'
    >>> pluralize('starfish')
    'starfish'
    >>> pluralize('sea-bass')
    'sea-bass'
    >>> pluralize('chamois')
    'chamois'
    >>> pluralize('sheep-rustler')
    'sheep-rustlers'
    >>> pluralize('I')
    'we'
    >>> pluralize('mongoose')
    'mongooses'
    >>> pluralize('mongoose', classical=1)
    'mongooses'
    >>> pluralize('ox')
    'oxen'
    >>> pluralize('brother')
    'brothers'
    >>> pluralize('brother', classical=1)
    'brethren'
    >>> pluralize('goose')
    'geese'
    >>> pluralize('louse')
    'lice'
    >>> pluralize('eye-tooth')
    'eye-teeth'
    >>> pluralize('he-man')
    'he-men'
    >>> pluralize('protozoon')
    'protozoa'
    >>> pluralize('aphelion')
    'aphelia'
    >>> pluralize('crisis')
    'crises'
    >>> pluralize('bacterium')
    'bacteria'
    >>> pluralize('alga')
    'algae'
    >>> pluralize('executrix', classical=1)
    'executrice'
    >>> pluralize('beau', classical=1)
    'beaux'
    >>> pluralize('milieu', classical=1)
    'milieux'
    >>> pluralize('sphynx', classical=1)
    'sphynges'
    >>> pluralize('church', classical=1)
    'churches'
    >>> pluralize('church', classical=0)
    'churches'
    >>> pluralize('sash', classical=1)
    'sashes'
    >>> pluralize('sash', classical=0)
    'sashes'
    >>> pluralize('lass', classical=1)
    'lasses'
    >>> pluralize('lass', classical=0)
    'lasses'
    >>> pluralize('dogma', classical=1)
    'dogmata'
    >>> pluralize('pragma', classical=1)
    'pragmata'
    >>> pluralize('amoeba', classical=1)
    'amoebae'
    >>> pluralize('stamen', classical=1)
    'stamina'
    >>> pluralize('stadium', classical=1)
    'stadia'
    >>> pluralize('focus', classical=1)
    'foci'
    >>> pluralize('genius', classical=1)
    'genii'
    >>> pluralize('status', classical=1)
    'status'
    >>> pluralize('soprano', classical=1)
    'soprani'
    >>> pluralize('testis', classical=1)
    'testes'
    >>> pluralize('testis')
    'testes'
    >>> pluralize('clitoris', classical=1)
    'clitorides'
    >>> pluralize('clitoris', classical=0)
    'clitorises'
    >>> pluralize('box')
    'boxes'
    >>> pluralize('bus')
    'buses'
    >>> pluralize('city')
    'cities'
    >>> pluralize('day')
    'days'

    """
    if number in countOne:
        return word
    original = word
    # String methods replace string.lower()/string.capitalize(), which are
    # gone from the string module on Python 3.
    word = word.lower()
    if isUninflected(word):
        retWord = word
    elif word in pronouns:
        retWord = pronouns[word]
    elif isIrregular(word):
        # first entry is the ordinary plural, last entry the classical one
        if not classical:
            retWord = irregular[word][0]
        else:
            retWord = irregular[word][-1]
    elif hasAssimilatedClassicalSuffix(word):
        retWord = fixAssimilatedSuffix(word)
    elif hasIrregularSuffix(word):
        retWord = irregSuffixRE.sub(replIrregSuffix, word)
    elif word in irregularS:
        if not classical:
            retWord = irregularS[word][0]
        else:
            retWord = irregularS[word][-1]
    elif incompletelyAssimilatedClassical(word):
        retWord = fixIncompletelyAssimilated(word, classical)
    elif word[-2:] in ('ch', 'sh') or word[-1:] in ('s', 'x', 'z'):
        # sibilant endings take -es: church/churches, lass/lasses, box/boxes
        retWord = word + 'es'
    elif len(word) > 1 and word[-1] == 'y' and word[-2] not in 'aeiou':
        # consonant + y -> ies (city/cities); vowel + y just takes -s (day/days)
        retWord = word[:-1] + 'ies'
    else:
        retWord = word + 's'
    if word != original:
        if word != 'i': # we is more likely to be right than We, since I is always capitalized...
            retWord = retWord.capitalize()
    return retWord
    


def _test(verbose=0):
    """Run the doctests of this module and return doctest's result summary.

    verbose -- passed straight through to doctest.testmod()
    """
    import doctest
    import sys
    # Test the module object that is actually loaded instead of importing it
    # again under the hard-coded name "plural": that only worked while the
    # file kept that name and was importable, and it loaded a second copy of
    # the module when the file was run as a script.
    return doctest.testmod(sys.modules[__name__], verbose=verbose)

if __name__ == '__main__':
    _test(0)
