Currently I am involved in a project that requires some string hyphenation. Initially I took a shot at it using Liang’s hyphenation algorithm (taken from here: Liang hyphenation in Python).
However, it could not meet my needs, so I had to switch to the one OpenOffice uses.
Anyway, here is the JavaScript implementation as a Node.js module.
/**
 * Hyphenator based on Liang's pattern-matching hyphenation algorithm.
 *
 * Every pattern is a string of characters interleaved with single digits,
 * e.g. 'a1b' or '.hy3ph'. The digits are priorities for the gaps between the
 * characters; '.' marks a word boundary. Patterns are stored in a trie
 * (`this.tree`) keyed by character, and the priority list of a pattern is
 * stored under the key 'None' of the node that ends the pattern.
 *
 * @constructor
 * @param {string[]} patterns - Hyphenation patterns. When omitted, an empty
 *   list is used and no word is ever split.
 */
function LiangHyphenator(patterns) {
  this.tree = {};
  this.patterns = patterns || [];
  for (var i = 0; i < this.patterns.length; i++) {
    this.__insertPattern(this.patterns[i]);
  }
}

/**
 * Parses one pattern and stores it in the trie.
 *
 * @param {string} pattern - Raw pattern, digits included.
 */
LiangHyphenator.prototype.__insertPattern = function (pattern) {
  var chars = this.__clearPattern(pattern);
  var points = this.__createPoints(pattern);
  this.__addPatternToTree(points, chars);
};

/**
 * Strips the priority digits from a pattern, leaving only the characters
 * that have to be matched against a word.
 *
 * @param {string} pattern - Raw pattern.
 * @returns {string} The pattern without any of the digits 0-9.
 */
LiangHyphenator.prototype.__clearPattern = function (pattern) {
  return pattern.replace(/[0-9]/g, '');
};

/**
 * Extracts the priority of every gap of a pattern.
 *
 * The pattern is split on every non-digit character, so the result has one
 * entry per gap (one more than the number of characters). Gaps without a
 * digit get priority 0. Splitting on "anything that is not a digit" rather
 * than on [.a-z] keeps the points aligned with the characters for patterns
 * written in non-ASCII alphabets as well.
 *
 * @param {string} pattern - Raw pattern.
 * @returns {number[]} Gap priorities, e.g. 'a1b' -> [0, 1, 0].
 */
LiangHyphenator.prototype.__createPoints = function (pattern) {
  var parts = pattern.split(/[^0-9]/);
  var points = [];
  for (var i = 0; i < parts.length; i++) {
    points.push(parts[i] === '' ? 0 : parseInt(parts[i], 10));
  }
  return points;
};

/**
 * Inserts a parsed pattern into the trie, creating nodes as needed.
 *
 * @param {number[]} points - Gap priorities of the pattern.
 * @param {string} chars - Pattern characters (digits already removed).
 */
LiangHyphenator.prototype.__addPatternToTree = function (points, chars) {
  var node = this.tree;
  for (var i = 0; i < chars.length; i++) {
    var c = chars[i];
    if (!node[c]) {
      node[c] = {};
    }
    node = node[c];
  }
  // Every other key in a node is a single character, so 'None' cannot
  // collide with a child node.
  node['None'] = points;
};

/**
 * Splits a word at the hyphenation points allowed by the patterns.
 *
 * Words of four characters or fewer are returned unsplit. Matching is done
 * on the lower-cased word wrapped in '.' boundary markers; the returned
 * pieces keep the original casing. No split is made after the first
 * character or before the last character of the word.
 *
 * @param {string} word - Word to hyphenate.
 * @returns {string[]} The pieces of the word; joining them with '' gives
 *   back `word`.
 */
LiangHyphenator.prototype.hyphenateWord = function (word) {
  if (word.length <= 4) {
    return [word];
  }

  var work = '.' + word.toLowerCase() + '.';
  var points = this.__createZeroArray(work.length + 1);

  // Try to match patterns starting at every position of the marked word and
  // keep, for every gap, the highest priority seen.
  for (var start = 0; start < work.length; start++) {
    var node = this.tree;
    for (var pos = start; pos < work.length; pos++) {
      node = node[work[pos]];
      if (!node) {
        break;
      }
      var patternPoints = node['None'];
      if (patternPoints) {
        for (var k = 0; k < patternPoints.length; k++) {
          points[start + k] = Math.max(points[start + k], patternPoints[k]);
        }
      }
    }
  }

  // Clear the two gaps nearest each end of the marked word.
  points[1] = 0;
  points[2] = 0;
  points[points.length - 2] = 0;
  points[points.length - 3] = 0;

  // points[2 + i] is the gap right after word[i]; an odd priority means a
  // hyphen is allowed there.
  var pieces = [''];
  var pairs = this.__zip([word.split(''), points.slice(2)]);
  for (var i = 0; i < pairs.length; i++) {
    pieces[pieces.length - 1] += pairs[i][0];
    if (pairs[i][1] % 2 !== 0) {
      pieces.push('');
    }
  }
  return pieces;
};

/**
 * Creates an array filled with zeros.
 *
 * @param {number} size - Number of elements.
 * @returns {number[]} Array of `size` zeros.
 */
LiangHyphenator.prototype.__createZeroArray = function (size) {
  // Declared locally: the undeclared assignment used previously leaked a
  // global and throws in strict mode / ES modules.
  var zeroArray = [];
  for (var i = 0; i < size; i++) {
    zeroArray.push(0);
  }
  return zeroArray;
};

/**
 * Zips several arrays together, using the length of the first one.
 *
 * @param {Array[]} arrays - Arrays to combine.
 * @returns {Array[]} One tuple per index of `arrays[0]`; tuple `i` holds
 *   element `i` of every input array (undefined where an array is shorter).
 */
LiangHyphenator.prototype.__zip = function (arrays) {
  var length = arrays[0].length;
  var zipped = [];
  for (var i = 0; i < length; i++) {
    var tuple = [];
    for (var a = 0; a < arrays.length; a++) {
      tuple.push(arrays[a][i]);
    }
    zipped.push(tuple);
  }
  return zipped;
};

// Export as a CommonJS module when one is available (always true under
// Node's require); the guard lets the file load in other environments too.
if (typeof module !== 'undefined' && module.exports) {
  module.exports = LiangHyphenator;
}
Just pointing out the obvious here:
This code does not handle the words in exception lists (Greek has no word exceptions, but other languages do).
Apart from that, nothing else is missing from the implementation, is there?
I am thinking of porting this code to C# and I am trying to figure out if this is a complete implementation.
Thanks for posting this. 🙂
The python code made me dizzy and the Java codes out there are a complete mess. 😦
Yes, it is a complete implementation of the algorithm, but it is up to you to add any extra features such as word exceptions. In my case I took what I needed (stop words, etc.) from the Lucene source code: https://github.com/apache/lucene-solr