Posts Tagged ‘algorithm’

Java example of Hamming distance

June 28, 2010

This algorithm calculates the distance between two strings, however they have to be of equal length.

It measures the minimum number of substitutions for the two strings to be equal.

It is used in telecommunication (also know as signal distance), it is also used in systematics as a measure of genetic distance.

Wikipedia entry can be found here:


public class Hamming
{
    private String compOne;
    private String compTwo;

    public Hamming(String one, String two)
    {
        compOne = one;
        compTwo = two;
    }

    ///
    //  Calculating the Hamming Distance for two strings requires the string to be of the same length.
    ///
    public int getHammingDistance()
    {
        if (compOne.length() != compTwo.length())
        {
            return -1;
        }

        int counter = 0;

        for (int i = 0; i < compOne.length(); i++)
        {
            if (compOne.charAt(i) != compTwo.charAt(i)) counter++;
        }

        return counter;
    }

    ///
    //  Hamming distance works best with binary comparisons, this function takes a string arrary of binary
    //  values and returns the minimum distance value
    ///
    public int minDistance(String[] numbers)
    {
        int minDistance = Integer.MAX_VALUE;

        if (checkConstraints(numbers))
        {
            for (int i = 1; i < numbers.length; i++)
            {
                int counter = 0;
                for (int j = 1; j <= numbers[i].length(); j++)
                {
                    if (numbers[i-1].charAt(j-1) != numbers[i].charAt(j-1))
                    {
                        counter++;
                    }
                }

                if (counter == 0) return counter;
                if (counter < minDistance) minDistance = counter;
            }
        }
        else
        {
            return -1;
        }

        return minDistance;
    }

    private Boolean checkConstraints(String[] numbers)
    {
        if (numbers.length > 1 && numbers.length <=50)
        {
            int prevLength = -1;
            for (int i = 0; i < numbers.length; i++)
            {
                if (numbers[i].length() > 0 && numbers[i].length() <= 50)
                {
                    if (prevLength == -1)
                    {
                        prevLength = numbers[i].length();
                    }
                    else
                    {
                        if (prevLength != numbers[i].length())
                        {
                            return false;
                        }
                    }

                    for (int j = 0; j < numbers[i].length(); j++)
                    {
                        if (numbers[i].charAt(j) != '0' && numbers[i].charAt(j) != '1')
                        {
                            return false;
                        }
                    }
                }
                else
                {
                    return false;
                }
            }
        }
        else
        {
            return false;
        }

        return true;
    }
}

Advertisements

Java example of Jaro Winkler

June 28, 2010

This algorithm is purposely designed for record linkage, it was designed for linking short strings. It calculates a normalised score on the similarity between two strings.

The calculation is based on the number of matching characters held within the string and the number of transpositions.

The wikipedia entry can be found here, the other source was lingpipe:


public class JaroWinkler
{
    private String compOne;
    private String compTwo;

    private String theMatchA = "";
    private String theMatchB = "";
    private int mRange = -1;
    
    public JaroWinkler()
    {
    }
    
    public JaroWinkler(String s1, String s2)
    {
        compOne = s1;
        compTwo = s2;
    }

    public double getSimilarity(String s1, String s2)
    {
        compOne = s1;
        compTwo = s2;

        mRange = Math.max(compOne.length(), compTwo.length()) / 2 - 1;
        
        double res = -1;

        int m = getMatch();
        int t = 0;
        if (getMissMatch(compTwo,compOne) > 0)
        {
            t = (getMissMatch(compOne,compTwo) / getMissMatch(compTwo,compOne));
        }

        int l1 = compOne.length();
        int l2 = compTwo.length();

        double f = 0.3333;
        double mt = (double)(m-t)/m;
        double jw = f * ((double)m/l1+(double)m/l2+(double)mt);
        res = jw + getCommonPrefix(compOne,compTwo) * (0.1*(1.0 - jw));
        
        return res;
    }

    private int getMatch()
    {
        
        theMatchA = "";
        theMatchB = "";
        
        int matches = 0;

        for (int i = 0; i < compOne.length(); i++)
        {
            //Look backward
            int counter = 0;
            while(counter <= mRange && i >= 0 && counter <= i)
            {
                if (compOne.charAt(i) == compTwo.charAt(i - counter))
                {
                    matches++;
                    theMatchA = theMatchA + compOne.charAt(i);
                    theMatchB = theMatchB + compTwo.charAt(i);
                }
                counter++;                
            }

            //Look forward
            counter = 1;
            while(counter <= mRange && i < compTwo.length() && counter + i < compTwo.length())
            {
                if (compOne.charAt(i) == compTwo.charAt(i + counter))
                {
                    matches++;
                    theMatchA = theMatchA + compOne.charAt(i);
                    theMatchB = theMatchB + compTwo.charAt(i);
                }
                counter++;
            }
        }
        return matches;
    }

    private int getMissMatch(String s1, String s2)
    {
        int transPositions = 0;

        for (int i = 0; i < theMatchA.length(); i++)
        {
            //Look Backward
            int counter = 0;
            while(counter <= mRange && i >= 0 && counter <= i)
            {
                if (theMatchA.charAt(i) == theMatchB.charAt(i - counter) && counter > 0)
                {
                    transPositions++;
                }
                counter++;
            }

            //Look forward
            counter = 1;
            while(counter <= mRange && i < theMatchB.length() && (counter + i) < theMatchB.length())
            {
                if (theMatchA.charAt(i) == theMatchB.charAt(i + counter) && counter > 0)
                {
                    transPositions++;
                }
                counter++;
            }
        }
        return transPositions;
    }

    private int getCommonPrefix(String compOne, String compTwo)
    {
        int cp = 0;
        for (int i = 0; i < 4; i++)
        {
            if (compOne.charAt(i) == compTwo.charAt(i)) cp++;
        }
        return cp;
    }
}

Java example of Markov Chain

June 28, 2010

The Markov Chain model calculates the probability of the next letter occurring in a sequence based on the previous n characters.

They are used in a multitude of areas, including data compression, entropy encoding, and pattern recognition.

The wikipedia entry can be found here. I based my algorithm off of an excellent explanation found here, provided by Rutgers as part of their course Discrete and Probabilistic Models in Biology. Warning, I think this document has a couple of mistakes in the matrix on page 10 (the counting of A,C,G,T), or I may be mistaken.

For this code, the final part which is needed is a method which gives the probability of a sequence, that is you pass in a sequence (e.g. AACGT) and it returns the probability of this sequence based on the matrix it has calculated.

The code:


public class Markov
{

    private class similarities
    {
        private char c;
        private int count;
        
        public similarities(char a, int b)
        {
            c = a;
            count = b;
        }
        
        public char getC()
        {
            return c;
        }
        
        public int getCount()
        {
            return count;
        }
        
        public void setCount(int a)
        {
            count = a;
        }
        
        public void increaseCount()
        {
            count++;
        }
    }

    private List<similarities> entries;
    private List<String> axies;
    private double[][] matrix;

    public Markov()
    {
        entries = new ArrayList();
        axies = new ArrayList();
    }

    public void getSimilarity(List<String> s)
    {
        String temp = new String();
        for (int i = 0; i < s.size(); i++)
        {
            if (temp.length() > 0)
            {
                temp = temp + " " + s.get(i);
            }
            else temp = s.get(i);

        }
        setupAxis(temp);
        compareCharsAgainstAxis(temp);

        display();

        for (int i = 0; i < matrix.length; i++)
        {
            double line = getLineValue(i);
            for (int j = 0; j < matrix[i].length; j++)
            {
                double t = matrix[i][j];

                matrix[i][j] = t / line;
            }
        }

        display();
    }

    private void compareCharsAgainstAxis(String s)
    {
        matrix = new double[axies.size()][axies.size()];
        String comparison = "";
        String test = s;

        for (int h = 0; h < test.length(); h++)
        {
            int end = test.indexOf(" ");
            if (end > 0)
            {
                comparison = test.substring(0,end);
            }
            else 
            {
                comparison = test.substring(0,test.length());
                end = test.length()-1;
            }
            test = test.substring(end+1,test.length());

            for (int i = 1; i < comparison.length(); i++)
            {
                for (int j = 0; j < axies.size(); j++)
                {
                    for (int k = 0; k < axies.size(); k++)
                    {
                        if (String.valueOf(comparison.charAt(i-1)).equalsIgnoreCase(axies.get(j)) && String.valueOf(comparison.charAt(i)).equalsIgnoreCase(axies.get(k)))
                        {
                            matrix[j][k]++;
                        }
                    }
                }
            }
        }

    }
    private void setupAxis(String temp)
    {
        for (int j = 0; j < temp.length(); j++)
        {
            boolean found = false;

            for (int k = 0; k < axies.size() && !found; k++)
            {
                if (String.valueOf(temp.charAt(j)).equalsIgnoreCase(axies.get(k)))
                {
                    found = true;
                }
            }

            if (!found)
            {
                if (temp.charAt(j) > 32)
                {
                    axies.add(String.valueOf(temp.charAt(j)));
                }
            }
        }
    }

    private double getTotalValue()
    {
        double sum = 0;
        for (int i = 0; i < matrix.length; i++)
        {
            for (int j = 0; j < matrix[i].length; j++)
            {
                double t = matrix[i][j];
                sum = sum + t;
            }
        }
        return sum;
    }

    private double getLineValue(int line)
    {
        double sum = 0;
        for (int i = 0; i < matrix[line].length; i++)
        {
            double t = matrix[line][i];
            sum = sum + t;
        }
        return sum;
    }

    private void display()
    {
        System.out.println("Sum of matrix is "+getTotalValue());
        System.out.println("Sum of line 1 = "+getLineValue(3));

        System.out.print("  ");
        for (int j = 0; j < axies.size(); j++)
        {
            System.out.print(axies.get(j)+" ");
        }
        System.out.println();
        for (int i = 0; i < matrix.length; i++)
        {
            System.out.print(axies.get(i)+" ");
            for (int j = 0; j < matrix[i].length; j++)
            {
                System.out.print(matrix[i][j]+" ");
            }
            System.out.println();
        }
    }
}