आर

2013-08-20 4 views
16

आर (R) में शब्दों में लिखी संख्या को अंकों में बदलें। क्या किसी को ऐसा कोई फ़ंक्शन पता है जो शब्दों में लिखी संख्या को वास्तविक संख्या में बदल दे, उदाहरण के लिए 'बीस हजार तीन सौ पांच' को 20305 में? मेरे डेटाफ़्रेम की पंक्तियों में संख्याएँ शब्दों में लिखी हैं और मैं उन्हें संख्याओं में बदलना चाहता हूँ। आर

qdap पैकेज में आप अंकों में लिखी संख्याओं को शब्दों से बदल सकते हैं (जैसे, 1001 बदलकर "one thousand one" हो जाता है), लेकिन इसका उल्टा नहीं कर सकते:

# qdap provides replace_number(); the call below shows the digits -> words
# direction (the printed result follows on the next line of the transcript).
library(qdap) 
# Replaces the numeral embedded in the sentence with its spelled-out form.
replace_number("I like 346457 ice cream cones.") 
[1] "I like three hundred forty six thousand four hundred fifty seven ice cream cones." 
+0

@ हेंक मैं आपके प्रश्न को थोड़ा और स्पष्ट करने के लिए थोड़ा सा लिखता हूं कि आपको शब्दों को संख्या में परिवर्तित करने की आवश्यकता है और इसके विपरीत नहीं। –

+2

मुझे लगता है कि सबसे अच्छा तरीका यह है कि उस व्यक्ति को शूट कर दिया जाए जिसने शब्दों के रूप में लिखी संख्याओं वाली फ़ाइल जमा की है। खैर, गंभीरता से कहूँ तो, मुझे संदेह है कि एक विस्तृत पार्सिंग एल्गोरिदम लिखे बिना ऐसा करने का कोई तरीका है — ऐसा एल्गोरिदम जिसमें सभी संख्या-शब्द ('एक', 'दो', ... 'सौ', 'हजार', ... 'googol') हों और साथ ही प्राथमिकता तय करने के लिए किसी प्रकार का ट्री-सॉर्टर भी हो। उदाहरण के लिए, आपके उदाहरण में दो "सौ" हैं, लेकिन अनुक्रम में उनके स्थान के अनुसार उनके अर्थ अलग-अलग हैं। –

उत्तर

14

यहाँ एक शुरुआत है जो आपको लाखों (hundreds of thousands) तक की संख्याओं के लिए काम दे देनी चाहिए।

# Convert an English number phrase (e.g. "forty six thousand four hundred
# fifty seven") into its numeric value.
#
# Arguments:
#   word  A single character string. Matching is case-insensitive; words may
#         be separated by whitespace and/or hyphens, and the connector "and"
#         is ignored ("one hundred and five" -> 105).
#
# Returns:
#   An unnamed list of length two: the input `word` unchanged, and the parsed
#   number (a double). An empty phrase yields 0. If the phrase contains a word
#   that is not a known number word, a warning is raised and the number is
#   NA_real_ (previously this case either failed with "object 'temp' not
#   found" or silently reused the value of the preceding word).
#
# Supported vocabulary: zero..nine, ten..nineteen, twenty..ninety, "hundred",
# and the scales thousand / million / billion / trillion.
#
# Spoken shorthand is kept: a number word immediately followed by a
# teens/tens word is read as hundreds, so "four fifty seven" is 457 and
# "six thousand four fifty seven" is 6457.
word2num <- function(word) {
  units <- c(zero = 0, one = 1, two = 2, three = 3, four = 4, five = 5,
             six = 6, seven = 7, eight = 8, nine = 9)
  teens <- c(eleven = 11, twelve = 12, thirteen = 13, fourteen = 14,
             fifteen = 15, sixteen = 16, seventeen = 17, eighteen = 18,
             nineteen = 19)
  tens <- c(ten = 10, twenty = 20, thirty = 30, forty = 40, fifty = 50,
            sixty = 60, seventy = 70, eighty = 80, ninety = 90)
  scales <- c(thousand = 1e3, million = 1e6, billion = 1e9, trillion = 1e12)
  small <- c(units, teens, tens)
  doubles <- names(c(teens, tens))

  # Split on runs of whitespace/hyphens so "forty-six" and accidental double
  # spaces both tokenize cleanly; "and" carries no numeric meaning.
  tokens <- strsplit(tolower(word), "[[:space:]-]+")[[1]]
  tokens <- tokens[!tokens %in% c("", "and")]

  unknown <- setdiff(tokens, c(names(small), "hundred", names(scales)))
  if (length(unknown) > 0) {
    warning("word2num: unrecognised number word(s): ",
            paste(unique(unknown), collapse = ", "), call. = FALSE)
    return(list(word, NA_real_))
  }

  total <- 0    # sum of the completed thousand/million/... groups
  current <- 0  # value of the group being built (the part below 1000)
  for (i in seq_along(tokens)) {
    token <- tokens[i]
    if (token == "hundred") {
      # A bare "hundred" counts as one hundred.
      current <- max(current, 1) * 100
    } else if (token %in% names(scales)) {
      # A scale word closes the current group: "four hundred thousand"
      # contributes 400 * 1000 and the next group starts from zero.
      total <- total + max(current, 1) * scales[[token]]
      current <- 0
    } else {
      value <- small[[token]]
      # Implicit hundreds: "four fifty" means four hundred fifty.
      if (i < length(tokens) && tokens[i + 1] %in% doubles) {
        value <- value * 100
      }
      current <- current + value
    }
  }
  list(word, total + current)
}

परिणाम:

> word2num("fifty seven") 
[[1]] 
[1] "fifty seven" 

[[2]] 
[1] 57 

> word2num("four fifty seven") 
[[1]] 
[1] "four fifty seven" 

[[2]] 
[1] 457 

> word2num("six thousand four fifty seven") 
[[1]] 
[1] "six thousand four fifty seven" 

[[2]] 
[1] 6457 

> word2num("forty six thousand four fifty seven") 
[[1]] 
[1] "forty six thousand four fifty seven" 

[[2]] 
[1] 46457 

> word2num("forty six thousand four hundred fifty seven") 
[[1]] 
[1] "forty six thousand four hundred fifty seven" 

[[2]] 
[1] 46457 

> word2num("three forty six thousand four hundred fifty seven") 
[[1]] 
[1] "three forty six thousand four hundred fifty seven" 

[[2]] 
[1] 346457 

मैं पहले से ही बता सकता हूँ कि यह word2num("four hundred thousand fifty") के लिए काम नहीं करेगा, क्योंकि इसे नहीं पता कि लगातार आने वाले "hundred" और "thousand" को कैसे संभालना है, लेकिन एल्गोरिदम को शायद संशोधित किया जा सकता है। यदि किसी के पास सुधार हों तो वह इसे बेझिझक संपादित करे या अपने उत्तर में इस पर आगे काम करे। मुझे यह एक मज़ेदार समस्या लगी (थोड़ी देर के लिए)।

संपादित करें: जाहिर तौर पर बिल वेनेबल्स का english नामक एक पैकेज है जो यह काम उपरोक्त कोड से भी बेहतर कर सकता है।

+0

यह देखने का प्रयास कर रहा है कि अंग्रेजी पैकेज कहां कर सकता है। यह केवल दूसरी तरफ जाना प्रतीत होता है लेकिन शायद मुझे यह याद आ रही है? –

-1

यहाँ एक समाधान है जो मुझे बेहतर लगता है।

library(stringdist) 
    library(gdata) 
    #Convert numeric words to digits 
# Fuzzy test for English number words.
#
# Arguments:
#   string  Word(s) to test; lower-cased before comparison.
#   dist    Largest string distance still counted as a match.
#   method  Distance method handed to stringdist().
#
# Returns TRUE when at least one comparison between `string` and the number
# vocabulary below is within `dist`, FALSE otherwise.
isNumericWord <- function(string, dist = 1, method = "dl") {
  units <- c("zero", "one", "two", "three", "four", "five", "six", "seven",
             "eight", "nine")
  ten_to_nineteen <- c("ten", "eleven", "twelve", "thirteen", "fourteen",
                       "fifteen", "sixteen", "seventeen", "eighteen",
                       "nineteen")
  tens <- c("twenty", "thirty", "forty", "fifty", "sixty", "seventy",
            "eighty", "ninety")
  magnitudes <- c("hundred", "thousand", "million", "billion", "trillion")
  vocabulary <- c(units, ten_to_nineteen, tens, magnitudes)

  distances <- stringdist(tolower(string), vocabulary, method = method)
  close_enough <- distances <= dist
  return(any(close_enough))
}
# Split ordinal number words into "<cardinal> <suffix>" so that the cardinal
# part can later be converted to digits, e.g. "first" -> "one st",
# "fourth" -> "four th", "sixteenth" -> "sixteen th".
#
# Arguments:
#   string  A character string; only the first element is processed.
#           Punctuation is replaced by spaces before tokenising.
#   dist    Largest string distance still counted as a match.
#   method  Distance method handed to stringdist().
#
# Returns:
#   The words of `string` joined by single spaces, with ordinals split as
#   described above, or "" when `string` contains no words.
numberTypes <- function(string, dist = 1, method = "dl") {
  nums <- c("zero", "one", "two", "three", "four", "five", "six", "seven",
            "eight", "nine", "ten", "eleven", "twelve", "thirteen",
            "fourteen", "fifteen", "sixteen", "seventeen", "eighteen",
            "nineteen", "twenty", "thirty", "forty", "fifty", "sixty",
            "seventy", "eighty", "ninety", "hundred", "thousand", "million",
            "billion", "trillion")

  # Irregular ordinals and their split form, applied in this order.
  ordinal_words <- c("first", "second", "third", "fourth", "fifth", "sixth",
                     "seventh", "eighth", "ninth", "tenth", "twentieth",
                     "thirtieth", "fortieth", "fiftieth", "sixtieth",
                     "seventieth", "eightieth", "ninetieth")
  ordinal_split <- c("one st", "two nd", "three rd", "four th", "five th",
                     "six th", "seven th", "eight th", "nine th", "ten th",
                     "twenty th", "thirty th", "forty th", "fifty th",
                     "sixty th", "seventy th", "eighty th", "ninety th")
  # For third..ninth a word ending in "y" is never treated as the ordinal:
  # "sixty" and "fifty" are one edit away from "sixth" and "fifth".
  skip_if_ends_in_y <- ordinal_words %in% c("third", "fourth", "fifth",
                                            "sixth", "seventh", "eighth",
                                            "ninth")

  string <- gsub("[[:punct:]]", " ", string)
  wrdsplit <- strsplit(string, split = " ")[[1]]
  wrdsplit <- wrdsplit[wrdsplit != ""]
  if (length(wrdsplit) == 0) {
    return("")
  }

  for (k in seq_along(ordinal_words)) {
    lowered <- tolower(wrdsplit)
    hit <- stringdist(ordinal_words[k], lowered, method = method) <= dist
    if (skip_if_ends_in_y[k]) {
      hit <- hit & substr(lowered, nchar(lowered), nchar(lowered)) != "y"
    }
    wrdsplit <- ifelse(hit, ordinal_split[k], wrdsplit)
  }

  # Regular ordinals: any other word made of a number word plus "th"
  # ("sixteenth", "hundredth", ...). The suffix test ignores case so that it
  # agrees with the case-insensitive matching above.
  for (i in seq_along(wrdsplit)) {
    len <- nchar(wrdsplit[i])
    suffix <- substr(wrdsplit[i], len - 1, len)
    stem <- substr(wrdsplit[i], 1, len - 2)
    if (tolower(suffix) == "th" && len != 2 &&
        any(stringdist(tolower(stem), nums, method = method) <= dist)) {
      wrdsplit[i] <- paste(stem, suffix, sep = " ")
    }
  }

  # Entries already split above (e.g. "four th") are matched again by the
  # loop and gain a second space; collapse every run of spaces to one. The
  # previous gsub(" ", " ", ...) replaced a space with a space and did nothing.
  gsub(" +", " ", paste(wrdsplit, collapse = " "))
}

#Convert number words to digits 
# Replace English number words inside a sentence with digits, e.g.
# "I ate twenty one cookies" -> "i ate 21 cookies" and
# "the twenty first day" -> "the 21st day".
#
# Arguments:
#   string  A character string; only the first element is processed.
#   dist    Largest string distance at which a word still counts as a
#           (possibly misspelled) number word.
#   method  Distance method handed to stringdist().
#
# Returns:
#   If no number word is found, `string` exactly as it was passed in.
#   Otherwise a single string rebuilt from the lower-cased words of `string`
#   (punctuation is dropped by numberTypes(), hyphens become spaces), with
#   each run of number words replaced by its digits and a following ordinal
#   suffix ("st", "nd", "rd", "th") attached to the number.
#
# Relies on numberTypes() and isNumericWord() from this file and on
# stringdist::stringdist().
#
# Fixes relative to the earlier implementation:
#   * a number word at the very end of the text no longer fails with
#     "missing value where TRUE/FALSE needed";
#   * "forty six thousand four hundred fifty seven" now gives 46457; the old
#     left-to-right fold on digit counts gave 46161;
#   * no exists("groups") lookups, which also saw objects named `groups` in
#     enclosing environments;
#   * no global options(scipen) change; numbers are formatted with sprintf();
#   * an ordinal suffix is attached only to the number it follows;
#   * `dist` and `method` are forwarded to the helper functions.
Word2Num <- function(string, dist = 1, method = "dl") {
  original <- string
  if (length(string) == 0L || is.na(string[1])) {
    return(original)
  }

  # Define numbers
  one_digits <- list(zero = 0, one = 1, two = 2, three = 3, four = 4,
                     five = 5, six = 6, seven = 7, eight = 8, nine = 9)
  teens <- list(eleven = 11, twelve = 12, thirteen = 13, fourteen = 14,
                fifteen = 15, sixteen = 16, seventeen = 17, eighteen = 18,
                nineteen = 19)
  ten_digits <- list(ten = 10, twenty = 20, thirty = 30, forty = 40,
                     fifty = 50, sixty = 60, seventy = 70, eighty = 80,
                     ninety = 90)
  large_digits <- list(hundred = 100, thousand = 1000, million = 1e6,
                       billion = 1e9, trillion = 1e12)
  double_digits <- c(teens, ten_digits)
  ordinal_suffixes <- c("rd", "th", "st", "nd")

  # Value of the entry in `table` whose name is closest to `token`, or NA
  # when no name is within `dist`.
  closest_value <- function(token, table) {
    d <- stringdist(token, names(table), method = method)
    best <- which.min(d)
    if (length(best) == 1L && d[best] <= dist) table[[best]] else NA_real_
  }

  # Numeric value of one number word. Single digits are tried first, except
  # for words ending in "y" (so "eighty" is never read as "eight").
  word_value <- function(token) {
    value <- NA_real_
    if (substr(token, nchar(token), nchar(token)) != "y") {
      value <- closest_value(token, one_digits)
    }
    if (is.na(value)) value <- closest_value(token, double_digits)
    if (is.na(value)) value <- closest_value(token, large_digits)
    value
  }

  # Fold the values of one run of number words into number(s). Usually one
  # number results; words that cannot belong to the same number
  # ("one two three") start a new one, giving 1 2 3.
  combine_values <- function(values) {
    finished <- numeric(0)
    total <- 0        # completed thousand/million/... groups
    current <- 0      # group being built (the part below 1000)
    low <- NA_real_   # sub-hundred part read in this group, NA if none yet
    for (v in values) {
      if (v < 100) {
        tens_then_unit <- !is.na(low) && low >= 20 && low %% 10 == 0 &&
          v >= 1 && v <= 9
        if (!is.na(low) && !tens_then_unit) {
          finished <- c(finished, total + current)
          total <- 0
          current <- 0
        }
        current <- current + v
        low <- if (tens_then_unit) low + v else v
      } else if (v == 100) {
        # A bare "hundred" counts as one hundred.
        current <- max(current, 1) * 100
        low <- NA_real_
      } else {
        # A scale word closes the group: "four hundred thousand" -> 400000.
        total <- total + max(current, 1) * v
        current <- 0
        low <- NA_real_
      }
    }
    c(finished, total + current)
  }

  # Split the string into words
  string <- gsub("-", " ", gsub(" & ", " and ", string, ignore.case = TRUE))
  string <- numberTypes(string, dist = dist, method = method)
  tokens <- strsplit(tolower(string), " ")[[1]]
  tokens <- tokens[tokens != ""]
  if (length(tokens) == 0L) {
    return(original)
  }

  # A word counts as a number only if it is flagged by isNumericWord() and
  # can actually be mapped to a value.
  values <- vapply(tokens, function(token) {
    if (isTRUE(isNumericWord(token, dist = dist, method = method))) {
      word_value(token)
    } else {
      NA_real_
    }
  }, numeric(1), USE.NAMES = FALSE)
  is_number <- !is.na(values)
  if (!any(is_number)) {
    return(original)
  }

  # Walk over the alternating runs of number / non-number words.
  runs <- rle(is_number)
  run_end <- cumsum(runs$lengths)
  run_start <- run_end - runs$lengths + 1L
  consumed <- logical(length(tokens))  # suffix words already glued to a number
  pieces <- vector("list", length(run_end))
  for (k in seq_along(run_end)) {
    idx <- seq.int(run_start[k], run_end[k])
    if (runs$values[k]) {
      # sprintf avoids scientific notation ("1e+06") for large values.
      digits <- sprintf("%.0f", combine_values(values[idx]))
      nxt <- run_end[k] + 1L
      if (nxt <= length(tokens) && tokens[nxt] %in% ordinal_suffixes) {
        digits[length(digits)] <- paste0(digits[length(digits)], tokens[nxt])
        consumed[nxt] <- TRUE
      }
      pieces[[k]] <- digits
    } else {
      pieces[[k]] <- tokens[idx[!consumed[idx]]]
    }
  }
  paste(unlist(pieces), collapse = " ")
}