Monday, April 6, 2009

F# - Converting sequences to lists on the fly

There was a question on StackOverflow about chunking sequences to arrays.  The poster wanted to have the input put into three-element arrays, so something like this:

> seq { for i in 1..10000 -> i} ;;
val it : seq<int> = seq [1; 2; 3; 4; ...]

would become:

> seq { for i in 1..10000 -> i} |> Seq.in_groups_of_n 3;;
val it : seq<int list>
= seq [[1; 2; 3]; [4; 5; 6]; [7; 8; 9]; [10; 11; 12]; ...]
> 

(I usually prefer to have my data in lists rather than arrays, so this isn't quite the answer they were looking for, but it's a trivial conversion from lists to arrays.)

My first solution was something more like iter:

  let in_lists_of_length_n n f (ie : seq<'a>) = 
    use e = ie.GetEnumerator()
    let rec next_with_acc acc =
      match e.MoveNext(), acc with
      | true, a when List.length a + 1 = n ->  
        f(List.rev (e.Current :: a))
        next_with_acc []
      | true, _ -> next_with_acc (e.Current :: acc)
      | false, _ ->
        f(List.rev acc)
      ()
    next_with_acc [] 
 
That's OK, but it's pretty limited.  If what you're doing fits into something like iter, it's good enough.

But what about just transforming the sequence into a new sequence?  That seems like a more general solution.  So:

module Seq =  
  let in_groups_of_n n (s: seq<'a>) =
    let rec in_groups_of_n_with_acc n acc (ie: IEnumerator<'a>) =
      seq {
        match ie.MoveNext(), acc with
        | true, a when List.length a + 1 = n -> 
          yield List.rev (ie.Current :: a)
          yield! in_groups_of_n_with_acc n [] ie
        | true, a -> yield! in_groups_of_n_with_acc n (ie.Current :: a) ie
        | false, a when List.length a > 0 -> yield List.rev acc
        | false, a -> ()
      }
    seq {
      yield! in_groups_of_n_with_acc n [] (s.GetEnumerator())
    }    

That's pretty good - it gets us to the point where we're returning a real sequence, so it's much more useful than the iter-style solution.

But why limit ourselves to lists-of-length-n?  Let's turn the accumulator into a function, so you can get any groups you want:

module Seq =  
  let grouped_by f (s: seq<'a>)=
    let rec grouped_by_with_acc (f: 'a -> 'a list -> 'a list option * 'a list) acc (ie: IEnumerator<'a>) =
      seq {
        if ie.MoveNext()
        then 
          let nextValue, leftovers = f ie.Current acc
          if nextValue.IsSome then yield nextValue.Value
          yield! grouped_by_with_acc f leftovers ie
        else
          if not acc.IsEmpty then yield acc
      }
    seq {
      yield! grouped_by_with_acc f [] (s.GetEnumerator())
    }    

This takes a function that accepts:
  • The next item in the sequence
  • Leftovers from the last run
And returns a tuple:
  • An optional value to yield
  • Leftovers
Here's how its used:

module TestGroupedBy =
  let GroupsOfN n newValue acc =
    let newList = newValue :: acc

    // If we have the right length, return
    // a Some as the first value.  That'll 
    // be yielded by the sequence.
    if List.length acc = n - 1
    then Some (List.rev newList), []
    // If we don't have the right length,
    // use None (so nothing will be yielded)
    else None, newList   
    
  // Note that we're going to pass a curried function - (GroupsOfN 3) is a function
  // taking two arguments, suitable for passing to grouped_by
  let resultWithInts = seq { for i in 1..100 -> i} |> Seq.grouped_by (GroupsOfN 3)
  
  printfn "int result %A" resultWithInts
  
  let resultWithChars = seq { for i in ['a'; 'b'; 'c'; 'd'; 'e'; 'f'] -> i} |> Seq.grouped_by (GroupsOfN 2)
  
  printfn "char result %A" resultWithChars  

The results are:

int result seq [[1; 2; 3]; [4; 5; 6]; [7; 8; 9]; [10; 11; 12]; ...]
char result seq [['a'; 'b']; ['c'; 'd']; ['e'; 'f']]

Let's use this with a more interesting grouping function. Say we want the colors of all the single cows in a list:

module TestGroupedByWithMatching =
  let Cows newValue acc =
    let newList = newValue :: acc
    match newList with
    | "cow" :: color :: "one" :: t -> Some [color], []
    | a :: b :: t -> None, [a; b]
    | l -> None, newList
    
  let someCows = seq { for i in ["one"; "red"; "cow"; "and"; "one"; "white"; "cow"; "and"; "three"; "green"; "cows"] -> i} |> Seq.grouped_by Cows
  printfn "some cows are %A" someCows
  
// This prints:  
// some cows are seq [["red"]; ["white"]; ["cows"; "green"]]
// > 

So there's a problem - at the end, we're yielding the accumulator.  That's what we wanted in the previous examples; we were looking for groups of three, plus the last group of 0, 1, or 2 items.  Here, though, if we don't get a match we don't want to see the leftovers.  Let's create a new method that takes another function that specifies what to do with the accumulator:

  let grouped_by_with_leftover_processing f (f2: 'a list -> list<'a> option) (s: seq<'a>)=
    let rec grouped_by_with_acc (f: 'a -> 'a list -> 'a list option * 'a list) acc (ie: IEnumerator<'a>) =
      seq {
        if ie.MoveNext()
        then 
          let nextValue, leftovers = f ie.Current acc
          if nextValue.IsSome then yield nextValue.Value
          yield! grouped_by_with_acc f leftovers ie
        else
          let rems = f2 acc
          if rems.IsSome then yield rems.Value
      }
    seq {
      yield! grouped_by_with_acc f [] (s.GetEnumerator())
    }    

Now we can specify a function to use for the leftovers.  This time we should get the right cows.

module TestGroupedByWithMatching =
  let Cows newValue acc =
    let newList = newValue :: acc
    match newList with
    | "cow" :: color :: "one" :: t -> Some [color], []
    | a :: b :: t -> None, [a; b]
    | l -> None, newList
  
  let IgnoreLeftovers f = None
  
  let someCows = seq { for i in ["one"; "red"; "cow"; "and"; "one"; "white"; "cow"; "and"; "three"; "green"; "cows"] -> i}
  let cowColors = someCows |> Seq.grouped_by_with_leftover_processing Cows IgnoreLeftovers
  printfn "cowColors are %A" cowColors
  
// This prints:  
// cowColors are seq [["red"]; ["white"]]

But we've got two separate functions (grouped_by_with_leftover_processing and grouped_by) that duplicate most of their code.  grouped_by is just grouped_by_with_leftover_processing so the final code is:

module Seq =  
  let grouped_by_with_leftover_processing f (f2: 'a list -> list<'a> option) (s: seq<'a>)=
    let rec grouped_by_with_acc (f: 'a -> 'a list -> 'a list option * 'a list) acc (ie: IEnumerator<'a>) =
      seq {
        if ie.MoveNext()
        then 
          let nextValue, leftovers = f ie.Current acc
          if nextValue.IsSome then yield nextValue.Value
          yield! grouped_by_with_acc f leftovers ie
        else
          let rems = f2 acc
          if rems.IsSome then yield rems.Value
      }
    seq {
      yield! grouped_by_with_acc f [] (s.GetEnumerator())
    }    

  let YieldReversedLeftovers (f: 'a list) = 
    if f.IsEmpty
    then None
    else Some (List.rev f)

  let grouped_by f s =
    grouped_by_with_leftover_processing f YieldReversedLeftovers s

  let group_by_length_n n s =
    let grouping_function newValue acc =
      let newList = newValue :: acc
      // If we have the right length, return
      // a Some as the first value.  That'll 
      // be yielded by the sequence.
      if List.length acc = n - 1
      then Some (List.rev newList), []
      // If we don't have the right length,
      // use None (so nothing will be yielded)
      else None, newList  
    grouped_by grouping_function s

No comments: