css

Thursday, January 26, 2012

Detect the language of text in ColdFusion

On occasion I have the opportunity to write something fun. For a project I am currently working on, I need to determine / detect the language of text stored in a database. I googled for solutions for determining the language of a string, but none of the solutions were easy to implement. So I decided to write a component to solve this problem. Basically, I created an array which contains frequently used words in different languages. The function then looks for these words in the text for which the language needs to be determined. Based on the number of occurrences of the words in the text, the language is returned. If you have a question, just leave a comment.

<!---
 detectLanguage: guesses the language of a piece of text by counting how many
 frequently used words of each known language occur in it.

 Variables.dl_languages is an array of structs, one per language, each with:
   description  display name of the language (in Dutch)
   lan          two-letter language code returned to the caller
   words        comma-separated list of lowercase indicator words / phrases

 NOTE: the entries in "words" are inserted into a regular expression as-is, so
 they must not contain regex metacharacters (the current lists contain none).
--->
<cfcomponent name="detectLanguage" output="false">
 <cfset Variables.dl_languages = arrayNew(1)>

 <cfset dl_addLanguage("EN", "Engels", "swimming pool,environment,satisfied,quiet,food,staff,location,delicious,room,rooms,beautiful,nice,friendly,breakfast,bad,well,price,prices,what,she,water,now,one,day,their,look,not,been,word,has,that,are,him,made,than,this,had,get,each,you,his,into,about,will,number,but,your,people,there,many,could,more,out,its,call,use,down,from,these,the,were,part,may,find,for,how,they,and,can,time,them,then,see,her,way,some,all,was,have,said,with,like,which,other,would,make,two,oil,write,did,come,when,who,long,first")>

 <cfset dl_addLanguage("NL", "Nederlands", "zwembad,omgeving,tevreden,rustig,rustige,eten,personeel,locatie,heerlijk,kamer,kamers,prachtig,leuk,vriendelijk,vriendelijke,ontbijt,slecht,goed,prijs,doen,dag,later,voor,worden,tijd,laatst,werk,zullen,maken,een,zeggen,nieuw,plaats,aan,hebben,dat,goed,jaar,kunnen,hij,aantal,niet,het,klein,die,man,leven,groot,was,van,mensen,moeten,heel,komen,vrouw,ook,zijn,lang")>

 <cfset dl_addLanguage("DE", "Duits", "schwimmbad,umwelt,zufrieden,ruhig,essen,personal,lage,lecker,zimmer,schön,freundlich,frühstück,schlecht,gut,preis,preise,sehen,über,selber,wenn,mein,mehr,machen,unser,durch,anderer,das,anderes,groß,immer,wissen,sie,sollen,jede,schon,beispiel,gut,eigentlich,sein,weil,werden,wieder,geben,stehen,also,damit,denn,sehr,bei,andere,gehen,selbst,viel,jedes,jeder,lang,können,doch,unter,aus,ihr,auf,haben,sich,der,kein,zeit,oben,ihm,ihn,noch,jetzt,als,hier,aber,all,was,wir,dann,und,jahr,dass,nach,nur,mir,lassen,uns,zwei,sagen,oder,erstes,mit,erster,kommen,ganz,neu,ein,mich,mal,ich,dies,von,wollen,bis,müssen,nicht,erste,auch,für,vor")>

 <cfset dl_addLanguage("FR", "Frans", "piscine,l'environnement,satisfaits,calme,alimentaires,personnel,lieu,délicieux,chambre,chambres,belle,amicale,petit déjeuner,mauvaise,ainsi,prix,elle,voir,dire,mon,demander,non,quelque,raison,aussi,depuis,sans,pendant,moins,monsieur,peu,même,monde,dont,trouver,celui,heure,premier,alors,être,autre,mettre,bien,dans,leur,qui,donner,que,jour,aucun,dernier,nouveau,passer,suite,falloir,personne,temps,sur,mais,deux,point,pour,avant,part,devoir,très,ainsi,vous,aller,comme,cela,nous,rendre,lequel,bon,notre,par,plus,avec,fois,encore,entre,avoir,lui,savoir,pouvoir,vouloir,venir,grand,déjà,comprendre,rester,prendre,tout,son,faire,après")>

 <cfset dl_addLanguage("ES", "Spaans", "piscina,ambiente,satisfecho,tranquilo,comida,personal,ubicación,delicioso,habitación,habitaciones,hermoso,bueno,amistoso,desayuno,malo,bien,precio,precios,pasar,también,hacer,menos,ese,porque,haber,ahora,desde,poder,seguir,llevar,hasta,donde,nuestro,nos,después,primero,nuevo,hablar,dar,querer,vida,para,otro,poner,poco,sin,encontrar,siempre,tan,sobre,por,con,vez,entonces,decir,dos,ver,eso,ella,entre,deber,dejar,así,tanto,grande,cada,cuando,llegar,año,muy,ser,mismo,tiempo,todo,más,como,bien,uno,nada,parte,creer,cosa,tener,saber,que,mucho,parecer,qué,este,hombre,estar,día,pero,quedar,alguno")>

 <cfset dl_addLanguage("IT", "Italiaans", "piscina,ambiente,soddisfatto,tranquillo,cibo,personale,posizione,delizioso,camera,camere,bello,bella,amichevole,prima colazione,bene,prezzo,prezzi,molto,detto,tutti,dire,tutto,non,noi,quello,più,bene,avere,hai,tuo,con,posso,andare,essere,due,ché,voglio,sono,alla,niente,qui,cosa,così,della,gli,quando,stare,solo,dei,del,fatto,volere,suo,sei,dove,fare,perché,sua,sta,anche,hanno,sapere,allora,questo,vuoi,una,mia,lui,stato,per,questa,che,ancora,chi,mio,grazie,come,uno,ora,mai,nel,era,siamo,lei,potere,abbiamo")>

 <!--- Registers one language (code, display name, indicator words) in Variables.dl_languages. --->
 <cffunction name="dl_addLanguage" access="private" returntype="void" output="false">
  <cfargument name="code" type="string" required="true">
  <cfargument name="description" type="string" required="true">
  <cfargument name="words" type="string" required="true">
  <cfset var entry = structNew()>

  <cfset entry.description = arguments.description>
  <cfset entry.lan = arguments.code>
  <cfset entry.words = arguments.words>
  <cfset arrayAppend(Variables.dl_languages, entry)>
 </cffunction>

 <!---
  Detects the language of the given text.

  Argument:
    in  the text to analyse.

  Returns a struct with the keys:
    languages      array of [language code, hit count] pairs, highest count
                   first; languages with equal counts keep declaration order
    success        true when exactly one language has the highest, non-zero count
    lan            code of the detected language, or "NA" when not successful
    absolute       hit count of the detected language (0 when not successful)
    relative       absolute as a percentage of wordsInString, rounded up
    wordsInString  number of word tokens found in the input
    message        "No matches" or "Multiple matches" on failure, otherwise ""
 --->
 <cffunction name="lan" access="public" returntype="struct" output="false">
  <cfargument name="in" type="string" required="true">
  <!--- Every local, including loop indexes, is var-scoped so that concurrent
        calls on a shared instance cannot overwrite each other's state. --->
  <cfset var result = structNew()>
  <cfset var wordsInString = arraylen(reMatch("[[:word:]]+", arguments.in))>
  <!--- Normalised once instead of once per indicator word. --->
  <cfset var text = lcase(trim(arguments.in))>
  <!--- A word counts when it is delimited by whitespace, punctuation or the
        start / end of the text. --->
  <cfset var boundary = "[[:space:][:punct:]]">
  <cfset var langIdx = 0>
  <cfset var word = "">
  <cfset var seen = "">
  <cfset var hits = 0>
  <cfset var score = "">
  <cfset var i = 0>
  <cfset var j = 0>

  <cfset result.languages = arrayNew(1)>
  <cfset result.success = false>
  <cfset result.absolute = 0>
  <cfset result.relative = 0>
  <cfset result.message = "">
  <cfset result.wordsInString = wordsInString>

  <cfloop from="1" to="#arrayLen(Variables.dl_languages)#" index="langIdx">
   <cfset hits = 0>
   <!--- Some lists contain the same word twice; count each word only once. --->
   <cfset seen = structNew()>

   <cfloop list="#Variables.dl_languages[langIdx].words#" index="word">
    <cfif not structKeyExists(seen, word)>
     <cfset seen[word] = true>
     <!--- The trailing boundary is a lookahead so it is not consumed and can
           also serve as the leading boundary of an immediately repeated word. --->
     <cfset hits = hits + arrayLen(reMatch("(^|" & boundary & ")" & word & "(?=" & boundary & "|$)", text))>
    </cfif>
   </cfloop>

   <cfset score = arrayNew(1)>
   <cfset score[1] = Variables.dl_languages[langIdx].lan>
   <cfset score[2] = hits>
   <cfset arrayAppend(result.languages, score)>
  </cfloop>

  <!--- Stable insertion sort, highest hit count first. --->
  <cfloop from="2" to="#arrayLen(result.languages)#" index="i">
   <cfloop from="#i#" to="2" step="-1" index="j">
    <cfif result.languages[j][2] gt result.languages[j-1][2]>
     <cfset arraySwap(result.languages, j, j-1)>
    <cfelse>
     <cfbreak>
    </cfif>
   </cfloop>
  </cfloop>

  <cfif arrayLen(result.languages) eq 0 or result.languages[1][2] eq 0>
   <cfset result.lan = "NA">
   <cfset result.message = "No matches">
  <cfelseif arrayLen(result.languages) gt 1 and result.languages[1][2] eq result.languages[2][2]>
   <!--- Two or more languages share the top score: refuse to guess. --->
   <cfset result.lan = "NA">
   <cfset result.message = "Multiple matches">
  <cfelse>
   <cfset result.success = true>
   <cfset result.absolute = result.languages[1][2]>
   <cfif result.wordsInString gt 0>
    <cfset result.relative = ceiling(result.absolute/result.wordsInString*100)>
   </cfif>
   <cfset result.lan = result.languages[1][1]>
  </cfif>

  <cfreturn result>
 </cffunction>
</cfcomponent>

No comments: