Liang’s hyphenation algorithm implementation in node.js

I am currently involved in a project that requires some string hyphenation. Initially I took a shot at it using Liang’s hyphenation algorithm (ported from an existing Python implementation of Liang’s hyphenation).
However, it did not meet my needs, so I had to switch to the algorithm that OpenOffice uses.

Anyway, here is the JavaScript implementation as a Node.js module.


/**
 * Word hyphenator driven by Liang-style patterns.
 *
 * Each pattern is a run of characters with optional digits between them
 * (for example 'a1b' or '.mis1'). The digits are weights for the gaps between
 * the characters; '.' is the character used for a word boundary. Patterns are
 * stored in a trie (`this.tree`) keyed by the pattern's non-digit characters;
 * the node reached by a pattern's last character holds that pattern's weight
 * list under the key 'None'.
 *
 * @constructor
 * @param {string[]|string} patterns - The patterns to load, either as an array
 *     or as one whitespace-separated string.
 */
function LiangHyphenator(patterns) {
    var list = patterns;

    // Indexing a plain string would insert it one character at a time,
    // so split it into individual patterns first.
    if (typeof list === 'string') {
        list = list.split(/\s+/).filter(function (entry) {
            return entry !== '';
        });
    }

    this.tree = {};
    this.patterns = list;

    for (var i = 0; i < list.length; i++) {
        this.__insertPattern(list[i]);
    }
}

/**
 * Adds a single pattern to the trie.
 *
 * @param {string} pattern - One pattern, e.g. 'a1b'.
 */
LiangHyphenator.prototype.__insertPattern = function (pattern) {
    var chars = this.__clearPattern(pattern);
    var points = this.__createPoints(pattern);

    this.__addPatternToTree(points, chars);
};

/**
 * Strips the weight digits from a pattern, leaving only its characters.
 *
 * @param {string} pattern - One pattern, e.g. 'a1b'.
 * @returns {string} The pattern without digits, e.g. 'ab'.
 */
LiangHyphenator.prototype.__clearPattern = function (pattern) {
    return pattern.replace(/[0-9]/g, '');
};

/**
 * Extracts the weight list of a pattern: one weight per gap around its
 * characters, with 0 where the pattern has no digit.
 *
 * @param {string} pattern - One pattern, e.g. 'a1b'.
 * @returns {number[]} The weights, e.g. [0, 1, 0]; one more entry than the
 *     pattern has non-digit characters.
 */
LiangHyphenator.prototype.__createPoints = function (pattern) {
    // Split on every non-digit so that letters outside a-z (e.g. Greek)
    // separate weights too, instead of ending up inside parseInt as NaN.
    var parts = pattern.split(/[^0-9]/);
    var points = [];

    for (var i = 0; i < parts.length; i++) {
        points.push(parts[i] === '' ? 0 : parseInt(parts[i], 10));
    }

    return points;
};

/**
 * Stores a pattern's weights in the trie, creating nodes along the way.
 *
 * @param {number[]} points - Weights as returned by __createPoints.
 * @param {string} chars - The pattern's characters as returned by __clearPattern.
 */
LiangHyphenator.prototype.__addPatternToTree = function (points, chars) {
    var node = this.tree;

    for (var i = 0; i < chars.length; i++) {
        var c = chars[i];
        if (!node[c]) {
            node[c] = {};
        }
        node = node[c];
    }

    // 'None' marks the end of a pattern; it cannot clash with a child key
    // because child keys are always a single character.
    node['None'] = points;
};

/**
 * Splits a word at the positions selected by the loaded patterns.
 *
 * Matching is done on the lower-cased word wrapped in '.' markers; the
 * returned pieces keep the original casing. A break is made after a character
 * when the highest weight collected for that gap is odd. Words of four
 * characters or fewer are returned unsplit, and no break is made directly
 * after the first character or directly before the last one.
 *
 * @param {string} word - The word to hyphenate.
 * @returns {string[]} The pieces of the word; joining them yields `word`.
 */
LiangHyphenator.prototype.hyphenateWord = function (word) {
    if (word.length <= 4) {
        return [word];
    }

    var work = '.' + word.toLowerCase() + '.';
    var points = this.__createZeroArray(work.length + 1);

    // Try every start position and walk the trie as far as the text matches,
    // keeping the maximum weight seen for each gap.
    for (var start = 0; start < work.length; start++) {
        var node = this.tree;

        for (var pos = start; pos < work.length; pos++) {
            node = node[work[pos]];
            if (!node) {
                break;
            }

            var weights = node['None'];
            if (weights) {
                for (var w = 0; w < weights.length; w++) {
                    points[start + w] = Math.max(points[start + w], weights[w]);
                }
            }
        }
    }

    // Suppress breaks next to the word's first and last characters.
    points[1] = 0;
    points[2] = 0;
    points[points.length - 2] = 0;
    points[points.length - 3] = 0;

    var pieces = [''];

    // points[k + 2] is the gap that follows word[k] (offset by the leading
    // '.' marker and the gap in front of the first character).
    for (var k = 0; k < word.length; k++) {
        pieces[pieces.length - 1] += word[k];

        if (points[k + 2] % 2 !== 0) {
            pieces.push('');
        }
    }

    return pieces;
};

/**
 * Creates an array of zeros.
 *
 * @param {number} size - Number of entries.
 * @returns {number[]} An array holding `size` zeros.
 */
LiangHyphenator.prototype.__createZeroArray = function (size) {
    // Declared locally: an undeclared assignment leaks a global in sloppy
    // mode and throws a ReferenceError in strict mode.
    var zeros = [];

    for (var i = 0; i < size; i++) {
        zeros.push(0);
    }

    return zeros;
};

/**
 * Zips several arrays into an array of tuples, one tuple per index of the
 * first array. No longer used by hyphenateWord; kept so existing callers of
 * this method continue to work.
 *
 * @param {Array[]} arrays - The arrays to combine.
 * @returns {Array[]} Tuples [arrays[0][i], arrays[1][i], ...] for each index
 *     i of arrays[0].
 */
LiangHyphenator.prototype.__zip = function (arrays) {
    var zipped = [];
    var length = arrays[0].length;

    for (var i = 0; i < length; i++) {
        var tuple = [];
        for (var a = 0; a < arrays.length; a++) {
            tuple.push(arrays[a][i]);
        }
        zipped.push(tuple);
    }

    return zipped;
};


// Expose the LiangHyphenator constructor as this CommonJS module's export.
module.exports = LiangHyphenator
Advertisement

2 thoughts on “Liang’s hyphenation algorithm implementation in node.js”

  1. Just pointing out the obvious here:
    This code does not handle the words in exception lists (Greek has no word exceptions, but other languages do).
    Apart from that, nothing else is missing from the implementation, is there?
    I am thinking of porting this code to C# and I am trying to figure out if this is a complete implementation.

    Thanks for posting this. 🙂
    The python code made me dizzy and the Java codes out there are a complete mess. 😦

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out /  Change )

Facebook photo

You are commenting using your Facebook account. Log Out /  Change )

Connecting to %s

This site uses Akismet to reduce spam. Learn how your comment data is processed.