Skip to content

Commit

Permalink
Merge pull request #264 from fslaborg/#263-add-map-merge
Browse files Browse the repository at this point in the history
  • Loading branch information
bvenn authored Apr 27, 2023
2 parents 3d6a220 + 814ae18 commit 4f71652
Show file tree
Hide file tree
Showing 6 changed files with 271 additions and 12 deletions.
64 changes: 64 additions & 0 deletions docs/Distributions.fsx
Original file line number Diff line number Diff line change
Expand Up @@ -816,6 +816,70 @@ categoricalDistribution
categoricalDistribution |> GenericChart.toChartHTML
(***include-it-raw***)


(**
### Distribution merging
You can merge two distributions by using `Empirical.merge`, specialised variants such as `Empirical.add`, or the generic function `Empirical.mergeBy`.
Merging two distributions leads to a combined distribution. If a key is present in both distributions, the value from `distA` is superseded by
the value from `distB`.
Please note that when handling continuous data, the binning (bandwidth) of both input distributions must be identical! When using categorical data,
the binning does not matter and the first parameter (`equalBandwidthOrNominal`) can be set to `true`.
*)

let a =
    [("k1",1);("k2",3)]
    |> Map.ofList

let b =
    [("k2",2);("k3",4)]
    |> Map.ofList

// "k2" is present in both maps, so the merged value for "k2" is taken from b.
let mergedDist = Empirical.merge true a b

(*** condition: ipynb ***)
#if IPYNB
mergedDist
#endif // IPYNB

(***hide***)
(sprintf "mergedDist = %A" mergedDist)
(***include-it-raw***)

(**
Adding two distributions leads to a combined distribution. If keys are present in both distributions the values at `distA` and `distB` are added.
*)

// Values of keys that occur in both maps are summed; all other entries are carried over.
let addedDist =
    (a, b) ||> Empirical.add true

(*** condition: ipynb ***)
#if IPYNB
addedDist
#endif // IPYNB

(***hide***)
sprintf "addedDist = %A" addedDist
(***include-it-raw***)

(**
A custom merging function can be defined:
*)

let customDist =
    // Combiner applied to keys that occur in both maps: value from a first, value from b second.
    let multiply valueA valueB = valueA * valueB
    Empirical.mergeBy true multiply a b

(*** condition: ipynb ***)
#if IPYNB
customDist
#endif // IPYNB

(***hide***)
sprintf "customDist = %A" customDist
(***include-it-raw***)

(**
## Density estimation
*)
Expand Down
62 changes: 61 additions & 1 deletion src/FSharp.Stats/Distributions/Empirical.fs
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
namespace FSharp.Stats.Distributions

open FSharp.Stats

/// Represents a probability mass function (map from values to probabilities).
module Empirical =
open System
Expand Down Expand Up @@ -183,15 +185,73 @@ module Empirical =
)
|> Map.ofSeq
|> normalize

/// <summary>Merges two empirical distributions into a single map. If a key exists in both maps, the resulting value is determined by f, with the first argument being the value from histA and the second the value from histB.</summary>
/// <param name="equalBandwidthOrNominal">Is the binwidth equal for both distributions? For nominal data set to true.</param>
/// <param name="f">Function to combine the values if a key is present in both distributions. `histA-value &#8594; histB-value &#8594; newValue`</param>
/// <param name="histA">Empirical distribution A</param>
/// <param name="histB">Empirical distribution B</param>
/// <remarks>When applied to continuous data the bandwidths must be equal!</remarks>
/// <remarks>The argument order matters: (mergeBy f a b) differs from (mergeBy f b a) unless f is commutative.</remarks>
/// <remarks>The merged values are returned as they are; no normalization is applied to the result.</remarks>
/// <returns>New map that results from merging histA and histB. Keys present in only one of the maps keep their value, keys present in both maps are handled by f.</returns>
/// <exception cref="System.Exception">Raised if equalBandwidthOrNominal is false, because merging distributions with differing bandwidths is not implemented yet.</exception>
let mergeBy equalBandwidthOrNominal (f: 'value -> 'value -> 'value) (histA: Map<_,'value>) (histB: Map<_,'value>) =
    if equalBandwidthOrNominal then
        Map.mergeBy f histA histB
    else
        // The message has no format arguments, so plain failwith is sufficient.
        failwith "Not implemented yet. If continuous data shall be merged, bandwidth must be equal. This does not matter for nominal data!"
        //ToDo:
        // Dissect both distributions and construct a new one based on given bandwidths
        // New bandwidth might be double the largest observed bandwidth to not miss-sort any data.

type EmpiricalDistribution() =
/// <summary>Combines two empirical distributions into one map. For every key found in both inputs, the entry of histB replaces the entry of histA.</summary>
/// <param name="equalBandwidthOrNominal">Is the binwidth equal for both distributions? For nominal data set to true.</param>
/// <param name="histA">Empirical distribution A</param>
/// <param name="histB">Empirical distribution B (its values win on shared keys)</param>
/// <remarks>When applied to continuous data the bandwidths must be equal!</remarks>
/// <remarks>This function is not commutative! (merge a b) is not equal to (merge b a)</remarks>
/// <returns>New map containing the keys of both histA and histB.</returns>
let merge equalBandwidthOrNominal (histA: Map<_,'value>) (histB: Map<_,'value>) =
    // On a key collision discard the value from histA and keep the one from histB.
    let keepSecond _ fromB = fromB
    (histA, histB) ||> mergeBy equalBandwidthOrNominal keepSecond

/// <summary>Combines two empirical distributions into one map. For every key found in both inputs, the value from histB is added to the value from histA.</summary>
/// <param name="equalBandwidthOrNominal">Is the binwidth equal for both distributions? For nominal data set to true.</param>
/// <param name="histA">Empirical distribution A</param>
/// <param name="histB">Empirical distribution B</param>
/// <remarks>When applied to continuous data the bandwidths must be equal!</remarks>
/// <remarks>The argument order only influences the result if `+` is not commutative for the value type.</remarks>
/// <returns>New map containing the keys of both histA and histB. Values of keys that are present in both maps are summed.</returns>
let inline add equalBandwidthOrNominal (histA: Map<_,'value>) (histB: Map<_,'value>) =
    (histA, histB)
    ||> mergeBy equalBandwidthOrNominal (fun fromA fromB -> fromA + fromB)


type EmpiricalDistribution() =

/// <summary>Creates probability mass function of the input sequence.
/// The bandwidth defines the width of the bins the numbers are sorted into.
/// Bin intervals are half open excluding the upper border: [lower,upper)</summary>
/// <param name="bandwidth">Width of the bins the numbers are sorted into</param>
/// <returns>Function that takes the data sequence and forwards it, together with the bandwidth, to Empirical.create</returns>
static member create(bandwidth: float) =
fun (data: seq<float>) ->
Empirical.create bandwidth data

///// <summary>Merges two maps into a single map. If a key exists in both maps, the value in histA is superseded by the value in histB.</summary>
///// <param name="histA">Empirical distribution A</param>
///// <param name="histB">Empirical distribution B</param>
///// <remarks>When applied to continuous data the bandwidths must be equal!</remarks>
///// <remarks>This function is not commutative! (merge a b) is not equal to (merge b a)</remarks>
///// <returns>New frequency map that results from merged maps histA and histB.</returns>
//static member merge: ((Map<_,float> -> Map<_,float> -> Map<_,float>)) =
// fun histA histB ->
// Empirical.merge histA histB

///// <summary>Merges two maps into a single map. If a key exists in both maps, the value from mapB is added to the value of mapA.</summary>
///// <param name="histA">Empirical distribution A</param>
///// <param name="histB">Empirical distribution B</param>
///// <remarks>When applied to continuous data the bandwidths must be equal!</remarks>
///// <remarks>This function is not commutative! (add a b) is not equal to (add b a)</remarks>
///// <returns>New frequency map that results from merged maps histA and histB. Values from keys that are present in both maps are handled by f</returns>
//static member add: ((Map<_,float> -> Map<_,float> -> Map<_,float>)) =
// fun histA histB ->
// Empirical.add histA histB

/// Creates probability mass function of the categories in the input sequence.
/// A template defines the search space to exclude certain elements or to include elements that are not in the input sequence.
Expand Down
62 changes: 52 additions & 10 deletions src/FSharp.Stats/Distributions/Frequency.fs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ namespace FSharp.Stats.Distributions

/// Represents a histogram (map from values to integer frequencies).
module Frequency =
open FSharp.Stats

/// Given the list [a,b,a,c,b,b], produce a map {a:2, b:3, c:1} which contains the count of each unique item in the list
let createGeneric list =
Expand Down Expand Up @@ -50,7 +51,7 @@ module Frequency =
if hist.ContainsKey(x) then
hist.[x]
else
0
0

/// Gets an unsorted sequence of frequencies
let frequencies (hist:Map<_,int>) =
Expand All @@ -60,18 +61,59 @@ module Frequency =
let isSubset (histA:Map<_,int>) (histB:Map<_,int>) =
    // histA is a subset of histB if no key of histA has a higher count than the
    // same key in histB (frequencyAt supplies the count used for comparison).
    // An empty histA is a subset of every histB.
    histA
    |> Map.forall (fun key count -> count <= frequencyAt histB key)

///// Subtracts the values histogramA from histogramB
//let subtract (histA:Map<'a,int>) (histB:Map<'a,int>) =
// Map.merge histA histB (fun k (v, v') -> v - v')

////// Adds the values in histogramA to histogramB
//let add (histA:Map<'a,int>) (histB:Map<'a,int>) =
// Map.merge histA histB (fun k (v, v') -> v + v')
/// <summary>Merges two histograms into a single histogram. If a key exists in both maps, the resulting value is determined by f, with the first argument being the value from histA and the second the value from histB.</summary>
/// <param name="equalBandwidthOrNominal">Is the binwidth equal for both frequencies? For nominal data set to true.</param>
/// <param name="f">Function to combine the values if a key is present in both histograms. `histA-value &#8594; histB-value &#8594; newValue`</param>
/// <param name="histA">Frequency map A</param>
/// <param name="histB">Frequency map B</param>
/// <remarks>When applied to continuous data the bandwidths must be equal!</remarks>
/// <remarks>The argument order matters: (mergeBy f a b) differs from (mergeBy f b a) unless f is commutative.</remarks>
/// <returns>New frequency map that results from merging histA and histB. Keys present in only one of the maps keep their value, keys present in both maps are handled by f.</returns>
/// <exception cref="System.Exception">Raised if equalBandwidthOrNominal is false, because merging histograms with differing bandwidths is not implemented yet.</exception>
let mergeBy equalBandwidthOrNominal f (histA: Map<_,'value>) (histB: Map<_,'value>) =
    if equalBandwidthOrNominal then
        Map.mergeBy f histA histB
    else
        // The message has no format arguments, so plain failwith is sufficient.
        failwith "Not implemented yet. If continuous data shall be merged, bandwidth must be equal. This does not matter for nominal data!"
        //ToDo:
        // Dissect both frequencies and construct a new one based on given bandwidths
        // New bandwidth might be double the largest observed bandwidth to not miss-sort any data.

/// <summary>Combines two histograms into one. For every key found in both inputs, the count of histB replaces the count of histA.</summary>
/// <param name="equalBandwidthOrNominal">Is the binwidth equal for both frequencies? For nominal data set to true.</param>
/// <param name="histA">Frequency map A</param>
/// <param name="histB">Frequency map B (its values win on shared keys)</param>
/// <remarks>When applied to continuous data the bandwidths must be equal!</remarks>
/// <remarks>This function is not commutative! (merge a b) is not equal to (merge b a)</remarks>
/// <returns>New frequency map containing the keys of both histA and histB.</returns>
let merge equalBandwidthOrNominal (histA: Map<_,'value>) (histB: Map<_,'value>) =
    // On a key collision the value from histA is dropped in favour of histB.
    let preferB _ valueB = valueB
    mergeBy equalBandwidthOrNominal preferB histA histB

/// <summary>Combines two histograms into one. For every key found in both inputs, the value from histB is subtracted from the value of histA.</summary>
/// <param name="equalBandwidthOrNominal">Is the binwidth equal for both frequencies? For nominal data set to true.</param>
/// <param name="histA">Frequency map A (minuend)</param>
/// <param name="histB">Frequency map B (subtrahend)</param>
/// <remarks>When applied to continuous data the bandwidths must be equal!</remarks>
/// <remarks>This function is not commutative! (subtract a b) is not equal to (subtract b a)</remarks>
/// <remarks>NOTE(review): keys that occur only in histB are copied with their original (non-negated) value, because the subtraction is applied to shared keys only - confirm that this is intended.</remarks>
/// <returns>New frequency map containing the keys of both histA and histB. Values of keys that are present in both maps are the difference histA-value - histB-value.</returns>
let inline subtract equalBandwidthOrNominal (histA: Map<_,'value>) (histB: Map<_,'value>) =
    (histA, histB)
    ||> mergeBy equalBandwidthOrNominal (fun fromA fromB -> fromA - fromB)

/// <summary>Combines two histograms into one. For every key found in both inputs, the values from histA and histB are added up.</summary>
/// <param name="equalBandwidthOrNominal">Is the binwidth equal for both frequencies? For nominal data set to true.</param>
/// <param name="histA">Frequency map A</param>
/// <param name="histB">Frequency map B</param>
/// <remarks>When applied to continuous data the bandwidths must be equal!</remarks>
/// <returns>New frequency map containing the keys of both histA and histB. Values of keys that are present in both maps are summed.</returns>
let inline add equalBandwidthOrNominal (histA: Map<_,'value>) (histB: Map<_,'value>) =
    (histA, histB)
    ||> mergeBy equalBandwidthOrNominal (fun fromA fromB -> fromA + fromB)

1 change: 1 addition & 0 deletions src/FSharp.Stats/FSharp.Stats.fsproj
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
<Compile Include="Seq.fs" />
<Compile Include="Array.fs" />
<Compile Include="List.fs" />
<Compile Include="Map.fs" />
<Compile Include="JaggedArray.fs" />
<Compile Include="Vector.fs" />
<Compile Include="RowVector.fs" />
Expand Down
50 changes: 50 additions & 0 deletions src/FSharp.Stats/Map.fs
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
namespace FSharp.Stats


/// Module to store specialised computations on maps
module Map =

    /// <summary>Merges two maps into a single map. If a key exists in both maps, the value is determined by f with the first value being from mapA and the second originating from mapB.</summary>
    /// <param name="f">Function to combine the values if a key is present in both maps. `mapA-value &#8594; mapB-value &#8594; newValue`</param>
    /// <param name="mapA">Map A</param>
    /// <param name="mapB">Map B</param>
    /// <remarks>When the maps represent binned continuous data the bandwidths must be equal!</remarks>
    /// <remarks>The argument order matters: (mergeBy f a b) differs from (mergeBy f b a) unless f is commutative.</remarks>
    /// <returns>New map that results from merging mapA and mapB. Keys present in only one of the maps keep their value, keys present in both maps are handled by f.</returns>
    let mergeBy (f: 'value -> 'value -> 'value) (mapA: Map<'key,'value>) (mapB: Map<'key,'value>) =
        // Start with mapA as accumulator and fold every entry of mapB into it.
        (mapA, mapB)
        ||> Map.fold (fun merged key valueB ->
            // A single Map.change handles both cases with one lookup:
            // combine with the existing value, or insert the value from mapB as it is.
            merged
            |> Map.change key (fun existing ->
                match existing with
                | Some valueA -> Some (f valueA valueB)
                | None -> Some valueB
            )
        )

    /// <summary>Merges two maps into a single map. If a key exists in both maps, the value in mapA is superseded by the value in mapB.</summary>
    /// <param name="mapA">Map A</param>
    /// <param name="mapB">Map B (its values win on shared keys)</param>
    /// <remarks>When the maps represent binned continuous data the bandwidths must be equal!</remarks>
    /// <remarks>This function is not commutative! (merge a b) is not equal to (merge b a)</remarks>
    /// <returns>New map that results from merging mapA and mapB.</returns>
    let merge (mapA: Map<'key,'value>) (mapB: Map<'key,'value>) =
        mergeBy (fun _ valueB -> valueB) mapA mapB

    /// <summary>Merges two maps into a single map. If a key exists in both maps, the value from mapB is subtracted from the value of mapA.</summary>
    /// <param name="mapA">Map A (minuend)</param>
    /// <param name="mapB">Map B (subtrahend)</param>
    /// <remarks>When the maps represent binned continuous data the bandwidths must be equal!</remarks>
    /// <remarks>This function is not commutative! (mergeSubtract a b) is not equal to (mergeSubtract b a)</remarks>
    /// <remarks>Keys that occur only in mapB are copied with their original (non-negated) value, because the subtraction is applied to shared keys only.</remarks>
    /// <returns>New map that results from merging mapA and mapB. Values of keys that are present in both maps are the difference mapA-value - mapB-value.</returns>
    let inline mergeSubtract (mapA: Map<'key,'value>) (mapB: Map<'key,'value>) =
        mergeBy (fun valueA valueB -> valueA - valueB) mapA mapB

    /// <summary>Merges two maps into a single map. If a key exists in both maps, the value from mapB is added to the value of mapA.</summary>
    /// <param name="mapA">Map A</param>
    /// <param name="mapB">Map B</param>
    /// <remarks>When the maps represent binned continuous data the bandwidths must be equal!</remarks>
    /// <remarks>The argument order only influences the result if `+` is not commutative for the value type.</remarks>
    /// <returns>New map that results from merging mapA and mapB. Values of keys that are present in both maps are summed.</returns>
    let inline mergeAdd (mapA: Map<'key,'value>) (mapB: Map<'key,'value>) =
        mergeBy (fun valueA valueB -> valueA + valueB) mapA mapB


44 changes: 43 additions & 1 deletion tests/FSharp.Stats.Tests/DistributionsEmpirical.fs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@ open TestExtensions
[<Tests>]
let empiricalTests =


let mySmallAlphabet = "abcdefghijklmnopqrstuvwxyz" |> Set.ofSeq
let myAlphabet = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" |> Set.ofSeq
let myAlphabetNum = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789" |> Set.ofSeq
Expand Down Expand Up @@ -146,5 +145,48 @@ let empiricalTests =
"Empirical.createNominal leads to a wrong PMF map keys"
TestExtensions.sequenceEqual(Accuracy.high) expectedValues actualValues
"Empirical.createNominal leads to a wrong PMF map values"
// Shared fixtures for the merge/add tests: both maps contain the keys 0.2 and 0.0,
// -0.1 occurs only in a, 0.3 occurs only in b.
let a =
    Map.ofList [ (0.2,12); (0.0,5); (-0.1,3) ]

let b =
    Map.ofList [ (0.2,-10); (0.3,2); (0.0,0) ]

testCase "add" <| fun () ->
    // Shared keys are summed (0.2: 12 + -10 = 2; 0.0: 5 + 0 = 5), the remaining keys are carried over.
    let expectedKeys,expectedValues =
        Map.ofSeq [|(-0.1,3);(0.0,5);(0.2,2);(0.3,2)|]
        |> Map.toArray
        |> Array.unzip
    let actualKeys,actualValues =
        Empirical.add true a b
        |> Map.toArray
        |> Array.unzip
    // Expect.equal takes the actual value first and the expected value second;
    // the swapped order would produce misleading failure messages.
    Expect.equal actualKeys expectedKeys
        "Empirical.add leads to a wrong distribution addition"
    Expect.equal actualValues expectedValues
        "Empirical.add leads to a wrong distribution addition"

testCase "merge" <| fun () ->
    // For shared keys the value of b wins (0.2: -10; 0.0: 0), the remaining keys are carried over.
    let expectedKeys,expectedValues =
        Map.ofSeq [|(-0.1,3);(0.0,0);(0.2,-10);(0.3,2)|]
        |> Map.toArray
        |> Array.unzip
    let actualKeys,actualValues =
        Empirical.merge true a b
        |> Map.toArray
        |> Array.unzip
    // Expect.equal takes the actual value first and the expected value second;
    // the swapped order would produce misleading failure messages.
    Expect.equal actualKeys expectedKeys
        "Empirical.merge leads to a wrong distribution merge"
    Expect.equal actualValues expectedValues
        "Empirical.merge leads to a wrong distribution merge"
]

0 comments on commit 4f71652

Please sign in to comment.