2014-06-17 12 views
6

Elasticsearch के लिए ब्रिटिश और अमेरिकी अंग्रेज़ी को सामान्यीकृत करें। Elasticsearch में ब्रिटिश और अमेरिकी अंग्रेज़ी को सामान्यीकृत करने का सबसे अच्छा अभ्यास क्या है?

Synonym Token Filter का उपयोग करने के लिए एक अविश्वसनीय रूप से लंबी कॉन्फ़िगरेशन फ़ाइल की आवश्यकता होती है। वास्तव में यूके और यूएस अंग्रेज़ी में कई हज़ार शब्दों की वर्तनी अलग-अलग है, और ऐसे शब्दों की वास्तव में व्यापक सूची ढूँढना लगभग असंभव है। यहाँ एक list of almost 2.000 words है, लेकिन यह पूर्ण होने से बहुत दूर है।

अधिमानतः, मैं rules to transform US to UK English के साथ एक ES विश्लेषक/फ़िल्टर बनाना चाहता हूँ। शायद यह बेहतर तरीका हो, लेकिन मुझे नहीं पता कि कहाँ से शुरू करूँ — इसके लिए मुझे किस प्रकार के फ़िल्टर की आवश्यकता होगी? इसमें सब कुछ शामिल होना ज़रूरी नहीं है — इसे बस अधिकांश खोज शब्दों को सामान्यीकृत करना चाहिए, जैसे "grey" - "gray", "colour" - "color", "centre" - "center", आदि।

उत्तर

2

यह वह तरीका है जिसे मैंने कुछ देर प्रयोग करने के बाद अपनाया। यह बुनियादी नियमों, "फिक्सेस" और समानार्थी शब्दों का एक संयोजन है: सबसे पहले, बुनियादी वर्तनी नियमों के एक सेट को लागू करने के लिए char_filter का उपयोग करें। यह 100% सही नहीं है, लेकिन काफ़ी अच्छी तरह काम करता है:

"char_filter": { 
    "en_char_filter": { "type": "mapping", "mappings": [ 
     # fixes 
     "aerie=>axerie", "aeroplane=>airplane", "aloe=>aloxe", "canoe=>canoxe", "coerce=>coxerce", "poem=>poxem", "prise=>prixse", 
     # whole words 
     "armour=>armor", "behaviour=>behavior", "centre=>center" "colour=>color", "clamour=>clamor", "draught=>draft", "endeavour=>endeavor", "favour=>favor", "flavour=>flavor", "harbour=>harbor", "honour=>honor", 
     "humour=>humor", "labour=>labor", "litre=>liter", "metre=>meter", "mould=>mold", "neighbour=>neighbor", "plough=>plow", "saviour=>savior", "savour=>savor", 
     # generic transformations 
     "ae=>e", "ction=>xion", "disc=>disk", "gramme=>gram", "isable=>izable", "isation=>ization", "ise=>ize", "ising=>izing", "ll=>l", "oe=>e", "ogue=>og", "sation=>zation", "yse=>yze", "ysing=>yzing" 
    ] } 
} 

"फिक्स" प्रविष्टियाँ अन्य नियमों को गलत तरीके से लागू होने से रोकने के लिए हैं। जैसे "prise=>prixse" नियम "prise" को "prize" में बदलने से रोकता है, जिसका अर्थ अलग है। आपको इसे अपनी ज़रूरतों के हिसाब से अनुकूलित करने की आवश्यकता हो सकती है।

# Synonym token filter definition; its rules are taken from EN_SYNONYMS.
"en_synonym_filter": { "type": "synonym", "synonyms": EN_SYNONYMS } 

यह रही हमारी पर्यायवाची सूची, जिसमें हमारे उपयोग के मामले के लिए सबसे महत्वपूर्ण कीवर्ड शामिल हैं:

इसके बाद, सबसे अधिक इस्तेमाल होने वाले अपवादों को पकड़ने के लिए एक पर्यायवाची (synonym) फ़िल्टर शामिल करें। आप इस सूची को अपनी आवश्यकताओं के अनुसार बदलना चाह सकते हैं:

# Synonym rules passed as the "synonyms" list of a synonym token filter.
# Each entry is one rule in Solr synonym syntax:
#   "a, b => c"   explicit mapping: a and b are replaced by c
#   "a, b, c"     equivalence set: the terms are treated as synonyms
# Entries are kept in roughly alphabetical order. Comments belong here as
# Python comments, not as "#..." strings inside the tuple.
EN_SYNONYMS = (
    "accolade, prize => award",
    "accoutrement => accouterment",
    "aching, pain => hurt",
    "acw, anticlockwise, counterclockwise, counter-clockwise => ccw",
    "adaptor => adapter",
    "advocate, attorney, barrister, procurator, solicitor => lawyer",
    "ageing => aging",
    "agendas, agendum => agenda",
    "almanack => almanac",
    "aluminium => aluminum",
    "america, united states, usa",
    "amphitheatre => amphitheater",
    "anti-aliased, anti-aliasing => antialiased",
    "arbour => arbor",
    "ardour => ardor",
    "arse => ass",
    "artefact => artifact",
    "aubergine => eggplant",
    "automobile, motorcar => car",
    "axe => ax",
    "bannister => banister",
    "barbecue => bbq",
    "battleaxe => battleax",
    "baulk => balk",
    "beetroot => beet",
    "biassed => biased",
    "biassing => biasing",
    "biscuit => cookie",
    "black american, african american, afro-american, negro",
    "bobsleigh => bobsled",
    "bonnet => hood",
    "bulb, electric bulb, light bulb, lightbulb",
    "burned => burnt",
    "bussines, bussiness => business",
    "business man, business people, businessman",
    "business woman, business people, businesswoman",
    "bussing => busing",
    "cactus, cactuses => cacti",
    "calibre => caliber",
    "candour => candor",
    "candy floss, candyfloss, cotton candy",
    "car park, parking area, parking ground, parking lot, parking-lot, parking place, parking",
    "carburettor => carburetor",
    "castor => caster",
    "cataloguing => cataloging",
    "catboat, sailboat, sailing boat",
    "champion, gainer, victor, win, winner => victory",
    "chat => talk",
    "chequebook => checkbook",
    "chequer => checker",
    "chequerboard => checkerboard",
    "chequered => checkered",
    "christmas tree ball, christmas tree ball ornament, christmas ball ornament, christmas bauble",
    "christmas, x-mas => xmas",
    "cinema => movies",
    "clangour => clangor",
    "clarinettist => clarinetist",
    "conditioning => conditioner",
    "conference => meeting",
    "coriander => cilantro",
    "corporate => company",
    "cosmos, universe => outer space",
    "cosy, cosiness => cozy",
    "criminal => crime",
    "curriculums => curricula",
    "cypher => cipher",
    "daddy, father, pa, papa => dad",
    "defence => defense",
    "defenceless => defenseless",
    "demeanour => demeanor",
    "departure platform, station platform, train platform, train station",
    "dishrag => dish cloth",
    "dishtowel, dishcloth => dish towel",
    "doughnut => donut",
    "downspout => drainpipe",
    "drugstore => pharmacy",
    "e-mail => email",
    "enamoured => enamored",
    "england => britain",
    "english => british",
    "epaulette => epaulet",
    "exercise, excercise, training, workout => fitness",
    "expressway, motorway, highway => freeway",
    "facebook => facebook, social media",
    "fanny => buttocks",
    "fanny pack => bum bag",
    "farmyard => barnyard",
    "faucet => tap",
    "fervour => fervor",
    "fibre => fiber",
    "fibreglass => fiberglass",
    "flashlight => torch",
    "flautist => flutist",
    "flier => flyer",
    "flower fly, hoverfly, syrphid fly, syrphus fly",
    "foot-walk, sidewalk, sideway => pavement",
    "football, soccer",
    "forums => fora",
    "fourth => 4",
    "freshman => fresher",
    "chips, fries, french fries",
    "gaol => jail",
    "gaolbird => jailbird",
    "gaolbreak => jailbreak",
    "gaoler => jailer",
    "garbage, rubbish => trash",
    "gasoline => petrol",
    "gases, gasses",
    "gauge => gage",
    "gauged => gaged",
    "gauging => gaging",
    "gipsy, gipsies, gypsies => gypsy",
    "glamour => glamor",
    "glueing => gluing",
    "gravesite, sepulchre, sepulture => sepulcher",
    "grey => gray",
    "greyish => grayish",
    "greyness => grayness",
    "groyne => groin",
    "gryphon, griffon => griffin",
    "hand shake, shake hands, shaking hands, handshake",
    "haulier => hauler",
    "hobo, homeless, tramp => bum",
    "new year, new year's eve, hogmanay, silvester, sylvester",
    "holiday => vacation",
    "holidaymaker, holiday-maker, vacationer, vacationist => tourist",
    "homosexual, fag => gay",
    "inbox, letterbox, outbox, postbox => mailbox",
    # "forth of july" is a deliberate catch for a common misspelling.
    "independence day, 4th of july, fourth of july, july 4th, july 4, 4th july, july fourth, forth of july, 4 july, fourth july",
    "infant, suckling, toddler => baby",
    "infeasible => unfeasible",
    "inquire, inquiry => enquire",
    "insure => ensure",
    "internet, website => www",
    "jelly => jam",
    "jewelery, jewellery => jewelry",
    "jogging => running",
    "journey => travel",
    "judgement => judgment",
    "kerb => curb",
    "kiwifruit => kiwi",
    "laborer => worker",
    "lacklustre => lackluster",
    "ladybeetle, ladybird, ladybug => ladybird beetle",
    "larrikin, scalawag, rascal, scallywag => naughty boy",
    "leaf => leaves",
    "licence, licenced, licencing => license",
    "liquorice => licorice",
    "lorry => truck",
    "loupe, magnifier, magnifying, magnifying glass, magnifying lens, zoom",
    "louvred => louvered",
    "louvres => louver",
    "lustre => luster",
    "mail => post",
    "mailman => postman",
    "marriage, married, marry, marrying, wedding => wed",
    "mayonaise => mayo",
    "meagre => meager",
    "misdemeanour => misdemeanor",
    "mitre => miter",
    "mom, momma, mummy, mother => mum",
    "moonlight => moon light",
    "moult => molt",
    "moustache, moustached => mustache",
    "nappy => diaper",
    "nightlife => night life",
    "normalcy => normality",
    "octopus => kraken",
    "odour => odor",
    "odourless => odorless",
    "offence => offense",
    "omelette => omelet",
    # Fix for "torres del paine".
    # NOTE(review): presumably keeps "paine" from being conflated with "pain"
    # (see "aching, pain => hurt") — confirm against the analyzer chain.
    "paine => painee",
    "pajamas => pyjamas",
    "pantyhose => tights",
    "parenthesis, parentheses => bracket",
    "parliament => congress",
    "parlour => parlor",
    "persnickety => pernickety",
    "philtre => filter",
    "phoney => phony",
    "popsicle => iced-lolly",
    "porch => veranda",
    "pretence => pretense",
    "pullover, jumper => sweater",
    "pyjama => pajama",
    "railway => railroad",
    "rancour => rancor",
    "rappel => abseil",
    "row house, serial house, terrace house, terraced house, terraced housing, town house",
    "rigour => rigor",
    "rumour => rumor",
    "sabre => saber",
    "saltpetre => saltpeter",
    "sanitarium => sanatorium",
    "santa, santa claus, st nicholas, st nicholas day",
    "sceptic, sceptical, scepticism, sceptics => skeptic",
    "sceptre => scepter",
    "shaikh, sheikh => sheik",
    "shivaree => charivari",
    "silverware, flatware => cutlery",
    "simultaneous => simultanous",
    "sleigh => sled",
    "smoulder, smouldering => smolder",
    "sombre => somber",
    "speciality => specialty",
    "spectre => specter",
    "splendour => splendor",
    "spoilt => spoiled",
    "street => road",
    "streetcar, tramway, tram => trolley-car",
    "succour => succor",
    "sulphate, sulphide, sulphur, sulphurous, sulfurous => sulfur",
    "super hero, superhero => hero",
    "surname => last name",
    "sweets => candy",
    "syphon => siphon",
    "syphoning => siphoning",
    "tack, thumb-tack, thumbtack => drawing pin",
    "tailpipe => exhaust pipe",
    "taleban => taliban",
    "teenager => teen",
    "television => tv",
    "thank you, thanks",
    "theatre => theater",
    "tickbox => checkbox",
    "ticked => checked",
    "timetable => schedule",
    "tinned => canned",
    "titbit => tidbit",
    "toffee => taffy",
    "tonne => ton",
    "transportation => transport",
    "trapezium => trapezoid",
    "trousers => pants",
    "tumour => tumor",
    "twitter => twitter, social media",
    "tyre => tire",
    "tyres => tires",
    "undershirt => singlet",
    "university => college",
    "upmarket => upscale",
    "valour => valor",
    "vapour => vapor",
    "vigour => vigor",
    "waggon => wagon",
    "windscreen, windshield => front shield",
    "world championship, world cup, worldcup",
    "worshipper, worshipping => worshiping",
    "yoghourt, yoghurt => yogurt",
    "zip, zip code, postal code, postcode",
    "zucchini => courgette",
)
+0

भले ही यह मेरा सवाल नहीं है, इसके लिए धन्यवाद! बहुत अच्छा काम। –

1

मुझे पता है कि यह उत्तर ओपी के मूल प्रश्न से कुछ हद तक हटकर है, लेकिन यदि आप केवल अमेरिकी बनाम ब्रिटिश अंग्रेज़ी के वर्तनी रूपों को सामान्यीकृत करना चाहते हैं, तो प्रबंधनीय आकार की एक सूची (~1,700 प्रतिस्थापन) यहाँ देख सकते हैं: http://www.tysto.com/uk-us-spelling-list.html । मुझे यकीन है कि ऐसी और भी सूचियाँ उपलब्ध हैं, जिनका उपयोग आप एक समेकित मास्टर सूची बनाने के लिए कर सकते हैं।

वर्तनी भिन्नताओं के अलावा, आपको बहुत सावधान रहना चाहिए कि शब्दों को, संदर्भ से अलग करके, अमेरिकी अंग्रेज़ी में उनके (अनुमानित!) समकक्षों से प्रतिस्थापित न करें। मैं सबसे निर्विवाद प्रतिस्थापनों को छोड़कर बाकी सभी के खिलाफ सलाह दूँगा। उदाहरण के लिए, मुझे इसमें कुछ भी बुरा नहीं दिखता: "acw, anticlockwise, counterclockwise, counter-clockwise => ccw"

लेकिन यह वाला

"hobo, homeless, tramp => bum"

"a homeless man" को *"a bum man" के रूप में इंडेक्स करेगा, जो बकवास है। (यह तो अलग बात है कि hobos, बेघर लोग और "tramps" काफी अलग-अलग होते हैं - http://knowledgenuts.com/2014/11/26/the-difference-between-hobos-tramps-and-bums/ ।)

संक्षेप में, वर्तनी भिन्नताओं के अलावा, अमेरिकी बनाम ब्रिटिश बोली का अंतर जटिल है और इसे साधारण सूची लुक-अप तक सीमित नहीं किया जा सकता।

पीएस: यदि आप वास्तव में इसे सही ढंग से करना चाहते हैं (यानी व्याकरणिक संदर्भ आदि को ध्यान में रखते हुए), तो आपको शायद एक संदर्भ-संवेदनशील पैराफ़्रेज़ मॉडल की आवश्यकता होगी, जो पाठ के ES इंडेक्स में पहुँचने से पहले उसका ब्रिटिश से अमेरिकी अंग्रेज़ी में (या आपकी ज़रूरत के अनुसार इसके विपरीत) "अनुवाद" करे। यह पर्याप्त समानांतर डेटा के साथ किसी ऑफ़-द-शेल्फ़ सांख्यिकीय अनुवाद मॉडल का उपयोग करके किया जा सकता है, या फिर किसी कस्टम, इन-हाउस सॉफ़्टवेयर से, जो प्राकृतिक भाषा पार्सिंग, पीओएस टैगिंग, चंकिंग इत्यादि का उपयोग करता हो।

संबंधित मुद्दे