% Copyright (C) 2002 David Roundy % % This program is free software; you can redistribute it and/or modify % it under the terms of the GNU General Public License as published by % the Free Software Foundation; either version 2, or (at your option) % any later version. % % This program is distributed in the hope that it will be useful, % but WITHOUT ANY WARRANTY; without even the implied warranty of % MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the % GNU General Public License for more details. % % You should have received a copy of the GNU General Public License % along with this program; if not, write to the Free Software Foundation, % Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. \chapter{LCS} \section{Introduction} ``LCS'' stands for ``Longest Common Subsequence,'' and it is a relatively challenging problem to find an LCS efficiently. I'm not going to explain here what an LCS is, but will point out that it is useful in finding how two sequences (lists, in this case) differ. This module implements the Hunt-Szymanski algorithm, which is appropriate for applications in which the sequence is on an infinite alphabet, such as diffing the lines in two files, where many, or most lines are unique. In the best case scenario, a permutation of unique lines, this algorithm is $O(n\log n)$. In the worst case scenario, that of a finite alphabet (i.e.\ where the number of elements in the sequence is much greater than the number of unique elements), it is an $O(n^2\log n)$ algorithm, which is pretty terrible. You should probably be aware that most diff programs do \emph{not} find an lcs. Instead they use a faster algorithm, which doesn't give an optimal diff, but does give one quickly. \begin{code} module Lcs ( lcs, subtract_subsequence ) where import List ( sort ) import Data.Array.ST import Control.Monad.ST import FastPackedString lcs :: Ord a => [a] -> [a] -> [a] {-# SPECIALIZE lcs ::[String] -> [String] -> [String] #-} {-# SPECIALIZE lcs ::[PackedString] -> [PackedString] -> [PackedString] #-} \end{code} In order to make use of our lcs, we will want to be able to remove it from the strings (to see what has changed). That is what subtract\_subsequence does for us. The first argument is the sequence, the second the subsequence to remove. \begin{code} subtract_subsequence :: Eq a => [a] -> [a] -> [a] subtract_subsequence s [] = s subtract_subsequence [] _ = error "Lcs.subtract_subsequence: Too much subsequence!" subtract_subsequence (c:cs) (s:ss) | c == s = subtract_subsequence cs ss | otherwise = c : subtract_subsequence cs (s:ss) \end{code} \begin{code} lcs [] _ = [] lcs _ [] = [] lcs (c1:c1s) (c2:c2s) | c1 == c2 = c1: lcs c1s c2s | otherwise = reverse $ lcs_simple (reverse (c1:c1s)) (reverse (c2:c2s)) lcs_simple :: Ord a => [a] -> [a] -> [a] lcs_simple [] _ = [] lcs_simple _ [] = [] lcs_simple s1@(c1:c1s) s2@(c2:c2s) | c1 == c2 = c1: lcs c1s c2s | otherwise = case unzip $ prune_matches s1 $! find_matches s1 s2 of (s1',m1') -> hunt s1' m1' prune_matches :: [a] -> [[Int]] -> [(a, [Int])] prune_matches _ [] = [] prune_matches [] _ = [] prune_matches (_:cs) ([]:ms) = prune_matches cs ms prune_matches (c:cs) (m:ms) = (c,m): prune_matches cs ms type Threshold s a = STArray s Int (Int,[a]) hunt :: [a] -> [[Int]] -> [a] hunt [] _ = [] hunt cs matches = runST (do th <- empty_threshold (length cs) l hunt_internal cs matches th hunt_recover th (-1) l ) where l = foldl max 0 $ concat matches hunt_internal :: [a] -> [[Int]] -> Threshold s a -> ST s () hunt_internal [] _ _ = return () hunt_internal _ [] _ = return () hunt_internal (c:cs) (m:ms) th = do hunt_one_char c m th hunt_internal cs ms th hunt_one_char :: a -> [Int] -> Threshold s a -> ST s () hunt_one_char _ [] _ = return () hunt_one_char c (j:js) th = do index_k <- my_bs j th case index_k of Nothing -> return () Just k -> do (_, rest) <- readArray th (k-1) writeArray th k (j, c:rest) hunt_one_char c js th -- This is O(n), which is stupid. hunt_recover :: Threshold s a -> Int -> Int -> ST s [a] hunt_recover th n limit | n < 0 = hunt_recover th th_max limit | n == 0 = return [] | n > th_max = return [] | otherwise = do (thn, sn) <- readArray th n if thn <= limit then return $ reverse sn else hunt_recover th (n-1) limit where (_th_min, th_max) = bounds th empty_threshold :: Int -> Int -> ST s (Threshold s a) empty_threshold l th_max = do th <- newArray (0,l) (th_max+1, []) writeArray th 0 (0, []) return th my_bs :: Int -> Threshold s a -> ST s (Maybe Int) my_bs j th = my_helper_bs j (bounds th) th my_helper_bs :: Int -> (Int,Int) -> Threshold s a -> ST s (Maybe Int) my_helper_bs j (th_min,th_max) th = if th_max - th_min > 1 then do (midth, _) <- readArray th th_middle if j > midth then my_helper_bs j (th_middle,th_max) th else my_helper_bs j (th_min,th_middle) th else do (minth, _) <- readArray th th_min (maxth, _) <- readArray th th_max if minth < j && maxth > j then return $ Just th_max else if j < minth then return $ Just th_min else return Nothing where th_middle = (th_max+th_min) `div` 2 \end{code} \begin{code} find_matches :: Ord a => [a] -> [a] -> [[Int]] find_matches [] [] = [] find_matches [] (_:bs) = []: find_matches [] bs find_matches _ [] = [] find_matches a b = unzip_indexed $ sort $ find_sorted_matches indexeda indexedb [] [] where indexeda = sort $ zip a [1..] indexedb = sort $ zip b [1..] unzip_indexed :: [(Int,[a])] -> [[a]] unzip_indexed s = unzip_indexed_helper 1 s where unzip_indexed_helper _ [] = [] unzip_indexed_helper thisl ((l,c):rest) | thisl == l = c: unzip_indexed_helper (l+1) rest | otherwise = []: unzip_indexed_helper (thisl+1) ((l,c):rest) find_sorted_matches :: Ord a => [(a, Int)] -> [(a, Int)] -> [a] -> [Int] -> [(Int, [Int])] find_sorted_matches [] _ _ _ = [] find_sorted_matches _ [] _ _ = [] find_sorted_matches ((a,na):as) ((b,nb):bs) aold aoldmatches | [a] == aold = (na, aoldmatches) : find_sorted_matches as ((b,nb):bs) aold aoldmatches | a > b = find_sorted_matches ((a,na):as) bs aold aoldmatches | a < b = find_sorted_matches as ((b,nb):bs) aold aoldmatches -- following line is inefficient if a line is repeated many times. | otherwise -- a == b = case reverse $ find_matches_one a ((b,nb):bs) of matches -> (na, matches) : find_sorted_matches as ((b,nb):bs) [a] matches find_matches_one :: Eq a => a -> [(a, Int)] -> [Int] find_matches_one _ [] = [] find_matches_one a ((b,nb):bs) | a == b = nb: find_matches_one a bs | otherwise = [] \end{code}