Rev. | 0ea264aff68f7ed684d29045d69242ebb498b40e |
---|---|
Tamanho | 8,440 bytes |
Hora | 2010-09-14 07:18:41 |
Autor | lorenzo |
Mensagem de Log | I cleaned up a bit the code and removed unnecessary tests/calculations
|
import Data.Ord
import Data.List
-- The module imported above contains a lot of functions to manipulate lists
import Numeric.LinearAlgebra
import qualified Data.Set as Set
main :: IO ()
main = do
txt <- readFile "couple_interaction_duration_times_extended_1_.dat"
let dat :: [[Integer]]
dat = convert txt -- Now dat is a list where every element is a row of the original data table
let dat_col = transpose dat -- now every entry of data_col is a column of the original data table
let tag_list = get_tag_list dat_col
-- putStrLn "tag_list is, "
-- print tag_list
-- let tag_1080_pos = get_single_tag_story dat_col 1080
-- putStrLn "positions of tag 1080, "
-- print tag_1080_pos
-- let t_1080= get_sel (dat_col!!1) tag_1080_pos
-- writeFile "ini_1080.dat" t_1080
-- putStrLn "the times of the begin of the contacts of tag 1080 are, "
-- print (take 30 t_1080 )
-- let t_1080_2 = single_tag_unique_contact_times dat_col 1080
-- putStrLn "or also "
-- print (take 30 t_1080_2 )
-- save "1080_hs.dat" t_1080
-- putStrLn "nub t_1080==nub (sort t_1080) is,"
-- print (( t_1080)==( (sort t_1080) ))
let unique_time_long = map (single_tag_unique_contact_times dat_col) tag_list
save_vector_flat "unique_time_long.dat" unique_time_long
-- putStrLn "unique_time_long is, "
-- print (unique_time_long)
-- let ctime = count t_1080
-- putStrLn "the repeated times are, "
-- print (take 10 ctime )
-- let n_occ = count_occurrencies t_1080
-- putStrLn "the occurrencies are, "
-- print ( n_occ )
-- let times_multi_contacts = find_repeated_times n_occ t_1080
-- putStrLn "the times at which multiple contacts are established are, "
-- print ( times_multi_contacts )
-- let duration_times = get_single_tag_contact_durations dat_col tag_1080_pos
-- putStrLn "contact durations are"
-- print (take 30 duration_times )
-- let summed_times = sum_corresponding_unique t_1080 duration_times
-- putStrLn "the summed duration times for multiple contacts starting at the same time are"
-- print (take 30 summed_times )
-- let summed_times_2 = single_tag_unique_contact_durations dat_col 1080
-- putStrLn "or also"
-- print (take 30 summed_times_2 )
let summed_durations_long = map (single_tag_unique_contact_durations dat_col) tag_list
save_vector_flat "unique_contact_durations_long.dat" summed_durations_long
let id_list_long = map (stick_tag_id dat_col) tag_list
save_vector_flat "tag_id_long.dat" id_list_long
-- It looks like the little script is really OK up to here!
-- Now sort the times at which a contact begins in increasing order
let unique_time_long_ordered = map (sort) unique_time_long
save_vector_flat "unique_time_long_ordered.dat" unique_time_long_ordered
let data_comb = zip unique_time_long summed_durations_long
-- let summed_durations_long_ordered = genericIndex data_comb 2
-- let ms = argsort_only (unique_time_long!!1) (summed_durations_long!!1)
let summed_durations_long_ordered = map (argsort_tuple . genericIndex data_comb) [0,1..length(data_comb)-1]
save_vector_flat "unique_contact_durations_long_ordered.dat" summed_durations_long_ordered
putStrLn "summed_durations_long_ordered is, "
print (summed_durations_long_ordered )
-- putStrLn "ms is, "
-- print (ms)
putStrLn "So far so good"
convert x = (map (map read . words) . lines) x
firstColumn xss = head (transpose xss)
nthColumn xss n = (transpose xss) !! n -- my first haskell function!
find_ij xss i j = (xss !! i) !! j -- for the case of this function, each entry of a list of lists (i.e. each sublist) is meant to be
--a row.
get_tag_list xss = nub $ (xss!!4) ++ (xss!!3) -- this automatically removes duplicate entries
get_single_tag_story dat tag_id = (nub ((findIndices (== tag_id) (dat!!3) ) ++ (findIndices (== tag_id) (dat!!4) )))
get_single_tag_contact_durations dat sel = get_sel (dat!!0) sel
-- where sel= sort $ nub ((findIndices (== tag_id) (dat!!3) ) ++ (findIndices (== tag_id) (dat!!4) ))
get_single_tag_contact_durations_tag_id dat tag_id = get_sel (dat!!0) sel
where sel= nub ((findIndices (== tag_id) (dat!!3) ) ++ (findIndices (== tag_id) (dat!!4) ))
single_tag_unique_contact_times dat_col tag_id = nub $ get_sel (dat_col!!1) sel
where sel = (nub ((findIndices (== tag_id) (dat_col!!3) ) ++ (findIndices (== tag_id) (dat_col!!4) )))
single_tag_non_unique_contact_times dat_col tag_id = get_sel (dat_col!!1) sel
where sel= nub ((findIndices (== tag_id) (dat_col!!3) ) ++ (findIndices (== tag_id) (dat_col!!4) ))
stick_tag_id dat_col tag_id = take n [tag_id,tag_id..]
where n= length $ single_tag_unique_contact_times dat_col tag_id
-- The following function is quite crucial as it combines several different functions
-- and given the list of data (in column format) and a tag id, it returns the list of
-- tag contact durations
single_tag_unique_contact_durations dat_col tag_id = sum_corresponding_unique bs ms
where
bs = single_tag_non_unique_contact_times dat_col tag_id
ms = get_single_tag_contact_durations_tag_id dat_col tag_id
find_repeated_times occurrences timelist = get_sel (sort(nub timelist)) sel
where sel = findIndices (>1) occurrences -- NB: to select the repeated elements in timelist I
-- need to make the timelist unique and to sort it (since
-- the function count_occurrencies to see how many time each unique entry of a list is repeated,
-- the list gets sorted in increasing order.)
-- Now I want to sum the durations corresponding to repeated times
sum_corresponding_unique bs ms = [sum [m | (b,m)<- zip bs ms, b == u] | u<- us]
where us = nub ( bs)-- where ms is the list of the elements
-- I want to sum for the corresponding repeated elements in bs.
-- NB2: this just works, I do not need to sort anything
-- A function returning a list with the elements of ml in position sel
get_sel ml sel = map (genericIndex ml) (sort sel) -- NB: sel has to be sorted in increasing order or there may be problems
-- and I also need genericIndex as !! does not work with Integer numbers (only with Int)
-- The one below is a more complicated function which probably do not understand completely hence I do not use it
selection :: Integral a => [a] -> [b] -> [b]
selection sel = pick distances -- NB: sel has to be increasing!
where
distances = zipWith (-) (sort sel) (0:(sort sel))
pick [] _ = []
pick (d:ds) xs = case genericDrop d xs of
[] -> []
ys@(y:_) -> y : pick ds ys
argsort bs ms = unzip . sortBy (comparing fst) $ zip bs ms -- where ms is the list which has to be sorted according to the
-- increasing order of bs. NB: this returns both bs sorted and ms argsorted accordingly
-- i.e. it returns a tuple with two elements
argsort_only bs ms = snd $ argsort bs ms -- this returns only the argsorted
argsort_tuple list = argsort_only list1 list2 -- as above, but to be applied to a tuple
where list1 = fst(list)
list2 = snd(list)
cumsum x = scanl1 (+) x -- see http://bit.ly/cagkw2 NB: scanl1 needs a function of two arguments, similar to foldl
-- see scanl1 f [x1, x2, ...] == [x1, x1 `f` x2, ...]
count l = map (\l@(x:xs) -> (x,length l)) . group . sort $ l
count_occurrencies x = map length l
where l = group.sort $ x
unique x = nub x
save filename zs = writeFile filename (show zs)
save_vector filename list = writeFile filename $ unlines (map show list)
save_vector_flat filename list = writeFile filename $ unlines (map show $ concat list)