From 974a718d151973b0d97ee088879228a75754de67 Mon Sep 17 00:00:00 2001 From: Michael Chavinda Date: Thu, 12 Jun 2025 17:55:21 -0700 Subject: [PATCH] Some major changes for expressions. * Add interpreter for expression language. * Remove copius reflection and use type families. * Change some examples to use the new derive API * deprecate deriveFrom --- app/Main.hs | 3 +- dataframe.cabal | 2 + docs/coming_from_dplyr.md | 2 +- src/DataFrame.hs | 1 + src/DataFrame/Display/Terminal/Plot.hs | 4 +- src/DataFrame/Errors.hs | 4 +- src/DataFrame/Internal/Column.hs | 261 +++++++++++++++----- src/DataFrame/Internal/Expression.hs | 120 +++++++++ src/DataFrame/Internal/Function.hs | 30 +-- src/DataFrame/Internal/Types.hs | 10 +- src/DataFrame/Operations/Aggregation.hs | 3 +- src/DataFrame/Operations/Core.hs | 5 +- src/DataFrame/Operations/Statistics.hs | 2 +- src/DataFrame/Operations/Subset.hs | 3 +- src/DataFrame/Operations/Transformations.hs | 44 +--- tests/Operations/Derive.hs | 12 +- 16 files changed, 363 insertions(+), 143 deletions(-) create mode 100644 src/DataFrame/Internal/Expression.hs diff --git a/app/Main.hs b/app/Main.hs index a637d110..b8f15155 100644 --- a/app/Main.hs +++ b/app/Main.hs @@ -101,8 +101,7 @@ chipotle = do print $ D.take 10 f -- Create a total_price column that is quantity * item_price - let multiply (a :: Int) (b :: Double) = fromIntegral a * b - let withTotalPrice = D.deriveFrom (["quantity", "item_price"], D.func multiply) "total_price" f + let withTotalPrice = D.derive "total_price" (D.lift fromIntegral (D.col @Int "quantity") * D.col @Double"item_price") f -- sample a filtered subset of the dataframe putStrLn "Sample dataframe" diff --git a/dataframe.cabal b/dataframe.cabal index 57caced1..0904e9e5 100644 --- a/dataframe.cabal +++ b/dataframe.cabal @@ -24,6 +24,7 @@ source-repository head library exposed-modules: DataFrame other-modules: DataFrame.Internal.Types, + DataFrame.Internal.Expression, DataFrame.Internal.Function, DataFrame.Internal.Parsing, DataFrame.Internal.Column, @@ -60,6 +61,7 @@ executable dataframe main-is: Main.hs other-modules: DataFrame, DataFrame.Internal.Types, + DataFrame.Internal.Expression, DataFrame.Internal.Function, DataFrame.Internal.Parsing, DataFrame.Internal.Column, diff --git a/docs/coming_from_dplyr.md b/docs/coming_from_dplyr.md index 997e0421..c9b366b4 100644 --- a/docs/coming_from_dplyr.md +++ b/docs/coming_from_dplyr.md @@ -111,7 +111,7 @@ starwars |> D.selectRange ("name", "mass") -- mass and height are optionals so we combine them with -- Haskell's Applicative operators. - |> D.deriveFrom (["mass", "height"], D.func (\w h -> bmi <$> w <*> h)) "bmi" + |> D.derive "bmi" (lift2 (/) (lift fromIntegral (col @Int "mass")) (lift fromIntegral (col@ Int "height"))) |> D.take 10 ``` diff --git a/src/DataFrame.hs b/src/DataFrame.hs index 446adcbf..2d7cf5d7 100644 --- a/src/DataFrame.hs +++ b/src/DataFrame.hs @@ -5,6 +5,7 @@ module DataFrame where import DataFrame.Internal.Types as D +import DataFrame.Internal.Expression as D import DataFrame.Internal.Function as D import DataFrame.Internal.Parsing as D import DataFrame.Internal.Column as D diff --git a/src/DataFrame/Display/Terminal/Plot.hs b/src/DataFrame/Display/Terminal/Plot.hs index 1c87b31f..1f89c31d 100644 --- a/src/DataFrame/Display/Terminal/Plot.hs +++ b/src/DataFrame/Display/Terminal/Plot.hs @@ -5,6 +5,7 @@ {-# LANGUAGE GADTs #-} {-# LANGUAGE NumericUnderscores #-} {-# LANGUAGE TupleSections #-} +{-# LANGUAGE FlexibleContexts #-} module DataFrame.Display.Terminal.Plot where import qualified Data.List as L @@ -19,9 +20,8 @@ import Control.Monad ( forM_, forM ) import Data.Bifunctor ( first ) import Data.Char ( ord, chr ) import DataFrame.Display.Terminal.Colours -import DataFrame.Internal.Column (Column(..)) +import DataFrame.Internal.Column (Column(..), Columnable) import DataFrame.Internal.DataFrame (DataFrame(..)) -import DataFrame.Internal.Types (Columnable) import DataFrame.Operations.Core import Data.Maybe (fromMaybe) import Data.Typeable (Typeable) diff --git a/src/DataFrame/Errors.hs b/src/DataFrame/Errors.hs index eeea901d..c67cd273 100644 --- a/src/DataFrame/Errors.hs +++ b/src/DataFrame/Errors.hs @@ -22,8 +22,8 @@ data DataFrameException where -> T.Text -- ^ call point -> DataFrameException TypeMismatchException' :: forall a . (Typeable a) - => TypeRep a -- ^ expected type - -> String -- ^ given type + => TypeRep a -- ^ given type + -> String -- ^ expected type -> T.Text -- ^ column name -> T.Text -- ^ call point -> DataFrameException diff --git a/src/DataFrame/Internal/Column.hs b/src/DataFrame/Internal/Column.hs index e1a4d9bb..9a069f99 100644 --- a/src/DataFrame/Internal/Column.hs +++ b/src/DataFrame/Internal/Column.hs @@ -1,3 +1,4 @@ +{-# LANGUAGE ConstraintKinds #-} {-# LANGUAGE ExplicitNamespaces #-} {-# LANGUAGE GADTs #-} {-# LANGUAGE InstanceSigs #-} @@ -10,6 +11,11 @@ {-# LANGUAGE FlexibleInstances #-} {-# LANGUAGE UndecidableInstances #-} {-# LANGUAGE MultiParamTypeClasses #-} +{-# LANGUAGE DataKinds #-} +{-# LANGUAGE TypeFamilies #-} +{-# LANGUAGE PolyKinds #-} +{-# LANGUAGE AllowAmbiguousTypes #-} +{-# LANGUAGE LambdaCase #-} module DataFrame.Internal.Column where import qualified Data.ByteString.Char8 as C @@ -29,14 +35,16 @@ import DataFrame.Internal.Types import DataFrame.Internal.Parsing import Data.Int import Data.Maybe +import Data.Proxy import Data.Text.Encoding (decodeUtf8Lenient) import Data.Type.Equality (type (:~:)(Refl), TestEquality (..)) -import Data.Typeable (Typeable) +import Data.Typeable (Typeable, cast) import Data.Word import Type.Reflection import Unsafe.Coerce (unsafeCoerce) import DataFrame.Errors import Control.Exception (throw) +import Data.Kind (Type, Constraint) -- | Our representation of a column is a GADT that can store data in either -- a vector with boxed elements or @@ -50,6 +58,12 @@ data Column where MutableBoxedColumn :: Columnable a => VBM.IOVector a -> Column MutableUnboxedColumn :: (Columnable a, VU.Unbox a) => VUM.IOVector a -> Column +data TypedColumn a where + TColumn :: Columnable a => Column -> TypedColumn a + +unwrapTypedColumn :: TypedColumn a -> Column +unwrapTypedColumn (TColumn value) = value + -- Functions about column metadata. isGrouped :: Column -> Bool isGrouped (GroupedBoxedColumn column) = True @@ -74,6 +88,9 @@ columnTypeString column = case column of GroupedUnboxedColumn (column :: VB.Vector a) -> show (typeRep @a) GroupedOptionalColumn (column :: VB.Vector a) -> show (typeRep @a) +instance (Show a) => Show (TypedColumn a) where + show (TColumn col) = show col + instance Show Column where show :: Column -> String show (BoxedColumn column) = show column @@ -112,47 +129,140 @@ instance Eq Column where Just Refl -> VB.map (L.sort . VG.toList) a == VB.map (L.sort . VG.toList) b (==) _ _ = False -class (Columnable a) => Columnify a where - -- | Converts a boxed vector to a column making sure to put - -- the vector into an appropriate column type by reflection on the - -- vector's type parameter. - toColumn' :: VB.Vector a -> Column - -instance (Columnable a) => Columnify (Maybe a) where - toColumn' = OptionalColumn - -instance (Columnable a) => Columnify (VB.Vector a) where - toColumn' = GroupedBoxedColumn - -instance (Columnable a, VU.Unbox a) => Columnify (VU.Vector a) where - toColumn' = GroupedUnboxedColumn - -instance {-# INCOHERENT #-} (Columnable a) => Columnify a where - toColumn' xs = case testEquality (typeRep @a) (typeRep @Int) of - Just Refl -> UnboxedColumn (VU.convert xs) - Nothing -> case testEquality (typeRep @a) (typeRep @Double) of - Just Refl -> UnboxedColumn (VU.convert xs) - Nothing -> case testEquality (typeRep @a) (typeRep @Float) of - Just Refl -> UnboxedColumn (VU.convert xs) - Nothing -> BoxedColumn xs - -class (Columnable a) => ColumnifyList a where - -- | Converts a boxed vector to a column making sure to put - -- the vector into an appropriate column type by reflection on the - -- vector's type parameter. - toColumn :: [a] -> Column - -instance (Columnable a) => ColumnifyList (Maybe a) where - toColumn = OptionalColumn . VB.fromList - -instance {-# INCOHERENT #-} (Columnable a) => ColumnifyList a where - toColumn xs = case testEquality (typeRep @a) (typeRep @Int) of - Just Refl -> UnboxedColumn (VU.fromList xs) - Nothing -> case testEquality (typeRep @a) (typeRep @Double) of - Just Refl -> UnboxedColumn (VU.fromList xs) - Nothing -> case testEquality (typeRep @a) (typeRep @Float) of - Just Refl -> UnboxedColumn (VU.fromList xs) - Nothing -> BoxedColumn (VB.fromList xs) +data Rep + = RBoxed + | RUnboxed + | ROptional + | RGBoxed + | RGUnboxed + | RGOptional + +type family If (cond :: Bool) (yes :: k) (no :: k) :: k where + If 'True yes _ = yes + If 'False _ no = no + +type family Unboxable (a :: Type) :: Bool where + Unboxable Int = 'True + Unboxable Int8 = 'True + Unboxable Int16 = 'True + Unboxable Int32 = 'True + Unboxable Int64 = 'True + Unboxable Word = 'True + Unboxable Word8 = 'True + Unboxable Word16 = 'True + Unboxable Word32 = 'True + Unboxable Word64 = 'True + Unboxable Char = 'True + Unboxable Bool = 'True + Unboxable Double = 'True + Unboxable Float = 'True + Unboxable _ = 'False + +-- | Compute the column representation tag for any ā€˜a’. +type family KindOf a :: Rep where + KindOf (Maybe a) = 'ROptional + KindOf (VB.Vector a) = 'RGBoxed + KindOf (VU.Vector a) = 'RGUnboxed + KindOf a = If (Unboxable a) 'RUnboxed 'RBoxed + +class ColumnifyRep (r :: Rep) a where + toColumnRep :: VB.Vector a -> Column + +type Columnable a = (Columnable' a, ColumnifyRep (KindOf a) a, UnboxIf a, SBoolI (Unboxable a) ) + +instance (Columnable a, VU.Unbox a) + => ColumnifyRep 'RUnboxed a where + toColumnRep = UnboxedColumn . VU.convert + +instance Columnable a + => ColumnifyRep 'RBoxed a where + toColumnRep = BoxedColumn + +instance Columnable a + => ColumnifyRep 'ROptional (Maybe a) where + toColumnRep = OptionalColumn + +instance Columnable a + => ColumnifyRep 'RGBoxed (VB.Vector a) where + toColumnRep = GroupedBoxedColumn + +instance (Columnable a, VU.Unbox a) + => ColumnifyRep 'RGUnboxed (VU.Vector a) where + toColumnRep = GroupedUnboxedColumn + +toColumn' :: + forall a. (Columnable a, ColumnifyRep (KindOf a) a) + => VB.Vector a -> Column +toColumn' = toColumnRep @(KindOf a) + +toColumn :: + forall a. (Columnable a, ColumnifyRep (KindOf a) a) + => [a] -> Column +toColumn = toColumnRep @(KindOf a) . VB.fromList + +data SBool (b :: Bool) where + STrue :: SBool 'True + SFalse :: SBool 'False + +class SBoolI (b :: Bool) where + sbool :: SBool b -- the run-time witness + +instance SBoolI 'True where sbool = STrue +instance SBoolI 'False where sbool = SFalse + +sUnbox :: forall a. SBoolI (Unboxable a) => SBool (Unboxable a) +sUnbox = sbool @(Unboxable a) + +type family When (flag :: Bool) (c :: Constraint) :: Constraint where + When 'True c = c + When 'False c = () -- empty constraint + +type UnboxIf a = When (Unboxable a) (VU.Unbox a) + +-- | Generic column transformation (no index). +transform + :: forall b c. -- element types + ( Columnable b + , Columnable c + , UnboxIf c -- only required when ā€˜c’ is unboxable + , Typeable b + , Typeable c ) + => (b -> c) + -> Column + -> Maybe Column +transform f = \case + + BoxedColumn (col :: VB.Vector a) + | Just Refl <- testEquality (typeRep @a) (typeRep @b) + -> Just (toColumn' @c (VB.map f col)) + | otherwise -> Nothing + + OptionalColumn (col :: VB.Vector a) + | Just Refl <- testEquality (typeRep @a) (typeRep @b) + -> Just (toColumn' @c (VB.map f col)) + | otherwise -> Nothing + + UnboxedColumn (col :: VU.Vector a) + | Just Refl <- testEquality (typeRep @a) (typeRep @b) + -> Just $ case sUnbox @c of + STrue -> UnboxedColumn (VU.map f col) -- needs VU.Unbox c + SFalse -> toColumn' @c (VB.map f (VB.convert col)) + | otherwise -> Nothing + + GroupedBoxedColumn (col :: VB.Vector (VB.Vector a)) + | Just Refl <- testEquality (typeRep @(VB.Vector a)) (typeRep @b) + -> Just (toColumn' @c (VB.map f col)) + | otherwise -> Nothing + + GroupedUnboxedColumn (col :: VB.Vector (VU.Vector a)) + | Just Refl <- testEquality (typeRep @(VU.Vector a)) (typeRep @b) + -> Just (toColumn' @c (VB.map f col)) + | otherwise -> Nothing + + GroupedOptionalColumn (col :: VB.Vector (VB.Vector a)) + | Just Refl <- testEquality (typeRep @(VB.Vector a)) (typeRep @b) + -> Just (toColumn' @c (VB.map f col)) + | otherwise -> Nothing -- | Converts a an unboxed vector to a column making sure to put -- the vector into an appropriate column type by reflection on the @@ -257,33 +367,33 @@ sortedIndexes asc (GroupedOptionalColumn column) = runST $ do -- Operations on a column that may change its type. -instance Transformable Column where - transform :: forall b c . (Columnable b, Columnable c) => (b -> c) -> Column -> Maybe Column - transform f (BoxedColumn (column :: VB.Vector a)) = do - Refl <- testEquality (typeRep @a) (typeRep @b) - return (toColumn' (VB.map f column)) - transform f (OptionalColumn (column :: VB.Vector a)) = do - Refl <- testEquality (typeRep @a) (typeRep @b) - return (toColumn' (VB.map f column)) - transform f (UnboxedColumn (column :: VU.Vector a)) = do - Refl <- testEquality (typeRep @a) (typeRep @b) - return $ if testUnboxable (typeRep @c) then transformUnboxed f column else toColumn' (VB.map f (VB.convert column)) - transform f (GroupedBoxedColumn (column :: VB.Vector (VB.Vector a))) = do - Refl <- testEquality (typeRep @(VB.Vector a)) (typeRep @b) - return (toColumn' (VB.map f column)) - transform f (GroupedUnboxedColumn (column :: VB.Vector (VU.Vector a))) = do - Refl <- testEquality (typeRep @(VU.Vector a)) (typeRep @b) - return (toColumn' (VB.map f column)) - transform f (GroupedOptionalColumn (column :: VB.Vector (VB.Vector a))) = do - Refl <- testEquality (typeRep @(VB.Vector a)) (typeRep @b) - return (toColumn' (VB.map f column)) +-- instance Transformable Column where +-- transform :: forall b c . (Columnable b, ColumnifyRep (KindOf b) b, Columnable c, ColumnifyRep (KindOf c) c) => (b -> c) -> Column -> Maybe Column +-- transform f (BoxedColumn (column :: VB.Vector a)) = do +-- Refl <- testEquality (typeRep @a) (typeRep @b) +-- return (toColumn' @c (VB.map f column)) +-- transform f (OptionalColumn (column :: VB.Vector a)) = do +-- Refl <- testEquality (typeRep @a) (typeRep @b) +-- return (toColumn' @c (VB.map f column)) +-- transform f (UnboxedColumn (column :: VU.Vector a)) = do +-- Refl <- testEquality (typeRep @a) (typeRep @b) +-- return $ if testUnboxable (typeRep @c) then transformUnboxed f column else toColumn' (VB.map f (VB.convert column)) +-- transform f (GroupedBoxedColumn (column :: VB.Vector (VB.Vector a))) = do +-- Refl <- testEquality (typeRep @(VB.Vector a)) (typeRep @b) +-- return (toColumn' @c (VB.map f column)) +-- transform f (GroupedUnboxedColumn (column :: VB.Vector (VU.Vector a))) = do +-- Refl <- testEquality (typeRep @(VU.Vector a)) (typeRep @b) +-- return (toColumn' @c (VB.map f column)) +-- transform f (GroupedOptionalColumn (column :: VB.Vector (VB.Vector a))) = do +-- Refl <- testEquality (typeRep @(VB.Vector a)) (typeRep @b) +-- return (toColumn' @c (VB.map f column)) -- | Applies a function that returns an unboxed result to an unboxed vector, storing the result in a column. -transformUnboxed :: forall a b . (Columnable a, VU.Unbox a, Columnable b) => (a -> b) -> VU.Vector a -> Column +transformUnboxed :: forall a b . (Columnable a, ColumnifyRep (KindOf a) a, VU.Unbox a, Columnable b, ColumnifyRep (KindOf b) b) => (a -> b) -> VU.Vector a -> Column transformUnboxed f = itransformUnboxed (const f) -- TODO: Make a type class with incoherent instances. -itransformUnboxed :: forall a b . (Columnable a, VU.Unbox a, Columnable b) => (Int -> a -> b) -> VU.Vector a -> Column +itransformUnboxed :: forall a b . (Columnable a, ColumnifyRep (KindOf a) a, VU.Unbox a, Columnable b, ColumnifyRep (KindOf b) b) => (Int -> a -> b) -> VU.Vector a -> Column itransformUnboxed f column = case testEquality (typeRep @b) (typeRep @Int) of Just Refl -> UnboxedColumn $ VU.imap f column Nothing -> case testEquality (typeRep @b) (typeRep @Int8) of @@ -314,8 +424,8 @@ itransformUnboxed f column = case testEquality (typeRep @b) (typeRep @Int) of Just Refl -> UnboxedColumn $ VU.imap f column Nothing -> error "Result type is unboxed" -- since we only call this after confirming --- | tranform with index. -itransform :: forall b c. (Columnable b, Columnable c) => (Int -> b -> c) -> Column -> Maybe Column +-- | transform with index. +itransform :: forall b c. (Columnable b, ColumnifyRep (KindOf b) b, Columnable b, Columnable c, ColumnifyRep (KindOf c) c) => (Int -> b -> c) -> Column -> Maybe Column itransform f (BoxedColumn (column :: VB.Vector a)) = do Refl <- testEquality (typeRep @a) (typeRep @b) return (toColumn' (VB.imap f column)) @@ -430,6 +540,25 @@ zipColumns (UnboxedColumn column) (BoxedColumn other) = BoxedColumn (VB.generate zipColumns (UnboxedColumn column) (UnboxedColumn other) = UnboxedColumn (VG.zip column other) {-# INLINE zipColumns #-} +zipWithColumns :: forall a b c . (Columnable a, ColumnifyRep (KindOf a) a, Columnable b, ColumnifyRep (KindOf b) b, Columnable c, ColumnifyRep (KindOf c) c) => (a -> b -> c) -> Column -> Column -> Column +zipWithColumns f (BoxedColumn (column :: VB.Vector d)) (BoxedColumn (other :: VB.Vector e)) = case testEquality (typeRep @a) (typeRep @d) of + Just Refl -> case testEquality (typeRep @b) (typeRep @e) of + Just Refl -> toColumn' (VG.zipWith f column other) + Nothing -> throw $ TypeMismatchException' (typeRep @b) (show $ typeRep @e) "" "zipWithColumns" + Nothing -> throw $ TypeMismatchException' (typeRep @a) (show $ typeRep @d) "" "zipWithColumns" +zipWithColumns f (BoxedColumn (column :: VB.Vector d)) (UnboxedColumn (other :: VU.Vector e)) = case testEquality (typeRep @a) (typeRep @d) of + Just Refl -> case testEquality (typeRep @b) (typeRep @e) of + Just Refl -> toColumn' (VG.zipWith f column (VB.convert other)) + Nothing -> throw $ TypeMismatchException' (typeRep @b) (show $ typeRep @e) "" "zipWithColumns" + Nothing -> throw $ TypeMismatchException' (typeRep @a) (show $ typeRep @d) "" "zipWithColumns" +zipWithColumns f left@(UnboxedColumn (column :: VU.Vector d)) right@(BoxedColumn (other :: VB.Vector e)) = zipWithColumns f right left +zipWithColumns f (UnboxedColumn (column :: VU.Vector d)) (UnboxedColumn (other :: VU.Vector e)) = case testEquality (typeRep @a) (typeRep @d) of + Just Refl -> case testEquality (typeRep @b) (typeRep @e) of + Just Refl -> toColumn' $ VB.zipWith f (VG.convert column) (VG.convert other) + Nothing -> throw $ TypeMismatchException' (typeRep @b) (show $ typeRep @e) "" "zipWithColumns" + Nothing -> throw $ TypeMismatchException' (typeRep @a) (show $ typeRep @d) "" "zipWithColumns" +{-# INLINE zipWithColumns #-} + -- Functions for mutable columns (intended for IO). -- Clean this up. writeColumn :: Int -> T.Text -> Column -> IO (Either T.Text Bool) diff --git a/src/DataFrame/Internal/Expression.hs b/src/DataFrame/Internal/Expression.hs new file mode 100644 index 00000000..9ff6e034 --- /dev/null +++ b/src/DataFrame/Internal/Expression.hs @@ -0,0 +1,120 @@ +{-# LANGUAGE ExplicitNamespaces #-} +{-# LANGUAGE GADTs #-} +{-# LANGUAGE InstanceSigs #-} +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE RankNTypes #-} +{-# LANGUAGE ScopedTypeVariables #-} +{-# LANGUAGE StrictData #-} +{-# LANGUAGE TypeApplications #-} +{-# LANGUAGE FlexibleContexts #-} +{-# LANGUAGE FlexibleInstances #-} +{-# LANGUAGE UndecidableInstances #-} +{-# LANGUAGE MultiParamTypeClasses #-} +module DataFrame.Internal.Expression where + +import qualified Data.Map as M +import Data.Type.Equality (type (:~:)(Refl), TestEquality (testEquality)) +import Data.Data (Typeable) +import DataFrame.Internal.Column +import DataFrame.Internal.DataFrame +import DataFrame.Internal.Types +import qualified Data.Text as T +import qualified Data.Vector as V +import qualified Data.Vector.Unboxed as VU +import Type.Reflection (typeRep) +import DataFrame.Errors (DataFrameException(ColumnNotFoundException)) +import Control.Exception (throw) +import Data.Maybe (fromMaybe) + +data Expr a where + Col :: Columnable a => T.Text -> Expr a + Lit :: Columnable a => a -> Expr a + Apply :: (Columnable a, ColumnifyRep (KindOf a) a, Columnable b, ColumnifyRep (KindOf b) b) => (b -> a) -> Expr b -> Expr a + BinOp :: (Columnable c, ColumnifyRep (KindOf c) c, Columnable b, ColumnifyRep (KindOf b) b, Columnable a, ColumnifyRep (KindOf a) a) => (c -> b -> a) -> Expr c -> Expr b -> Expr a + +interpret :: forall a b . (Columnable a) => DataFrame -> Expr a -> TypedColumn a +interpret df (Lit value) = TColumn $ toColumn' $ V.replicate (fst $ dataframeDimensions df) value +interpret df (Col name) = case getColumn name df of + Nothing -> throw $ ColumnNotFoundException name "" (map fst $ M.toList $ columnIndices df) + Just col -> TColumn col +interpret df (Apply (f :: c -> d) value) = let + (TColumn value') = interpret @c df value + in TColumn $ fromMaybe (error "transform returned nothing") (transform f value') +interpret df (BinOp (f :: c -> d -> e) left right) = let + (TColumn left') = interpret @c df left + (TColumn right') = interpret @d df right + in TColumn $ zipWithColumns f left' right' + +instance (Num a, Columnable a, ColumnifyRep (KindOf a) a) => Num (Expr a) where + (+) :: Expr a -> Expr a -> Expr a + (+) = BinOp (+) + + (*) :: Expr a -> Expr a -> Expr a + (*) = BinOp (*) + + fromInteger :: Integer -> Expr a + fromInteger = Lit . fromInteger + + negate :: Expr a -> Expr a + negate = Apply negate + + abs :: Num a => Expr a -> Expr a + abs = Apply abs + + signum :: Num a => Expr a -> Expr a + signum = Apply signum + +instance (Fractional a, Columnable a, ColumnifyRep (KindOf a) a) => Fractional (Expr a) where + fromRational :: (Fractional a, Columnable a) => Rational -> Expr a + fromRational = Lit . fromRational + + (/) :: (Fractional a, Columnable a) => Expr a -> Expr a -> Expr a + (/) = BinOp (/) + +instance (Floating a, Columnable a, ColumnifyRep (KindOf a) a) => Floating (Expr a) where + pi :: (Floating a, Columnable a) => Expr a + pi = Lit pi + exp :: (Floating a, Columnable a) => Expr a -> Expr a + exp = Apply exp + log :: (Floating a, Columnable a) => Expr a -> Expr a + log = Apply log + sin :: (Floating a, Columnable a) => Expr a -> Expr a + sin = Apply sin + cos :: (Floating a, Columnable a) => Expr a -> Expr a + cos = Apply cos + asin :: (Floating a, Columnable a) => Expr a -> Expr a + asin = Apply asin + acos :: (Floating a, Columnable a) => Expr a -> Expr a + acos = Apply acos + atan :: (Floating a, Columnable a) => Expr a -> Expr a + atan = Apply atan + sinh :: (Floating a, Columnable a) => Expr a -> Expr a + sinh = Apply sinh + cosh :: (Floating a, Columnable a) => Expr a -> Expr a + cosh = Apply cosh + asinh :: (Floating a, Columnable a) => Expr a -> Expr a + asinh = Apply sinh + acosh :: (Floating a, Columnable a) => Expr a -> Expr a + acosh = Apply acosh + atanh :: (Floating a, Columnable a) => Expr a -> Expr a + atanh = Apply atanh + + +instance (Show a) => Show (Expr a) where + show :: Show a => Expr a -> String + show (Col name) = "col(" ++ T.unpack name ++ ")" + show (Lit value) = show value + show (Apply f value) = "apply(" ++ show value ++ ")" + show (BinOp f a b) = "binop(" ++ show a ++ ", " ++ show b ++ ")" + +col :: (Columnable a, ColumnifyRep (KindOf a) a) => T.Text -> Expr a +col = Col + +lit :: (Columnable a, ColumnifyRep (KindOf a) a) => a -> Expr a +lit = Lit + +lift :: (Columnable a, ColumnifyRep (KindOf a) a, Columnable b, ColumnifyRep (KindOf b) b) => (a -> b) -> Expr a -> Expr b +lift = Apply + +lift2 :: (ColumnifyRep (KindOf c) c, Columnable c, ColumnifyRep (KindOf b) b, Columnable b, ColumnifyRep (KindOf a) a, Columnable a) => (c -> b -> a) -> Expr c -> Expr b -> Expr a +lift2 = BinOp diff --git a/src/DataFrame/Internal/Function.hs b/src/DataFrame/Internal/Function.hs index 04d1ee78..302c6bab 100644 --- a/src/DataFrame/Internal/Function.hs +++ b/src/DataFrame/Internal/Function.hs @@ -21,34 +21,34 @@ import Type.Reflection (typeRep, typeOf) -- A GADT to wrap functions so we can have hetegeneous lists of functions. data Function where - F1 :: forall a b . (Columnable a, Columnable b) => (a -> b) -> Function - F2 :: forall a b c . (Columnable a, Columnable b, Columnable c) => (a -> b -> c) -> Function - F3 :: forall a b c d . (Columnable a, Columnable b, Columnable c, Columnable d) => (a -> b -> c -> d) -> Function - F4 :: forall a b c d e . (Columnable a, Columnable b, Columnable c, Columnable d, Columnable e) => (a -> b -> c -> d -> e) -> Function - Cond :: forall a . (Columnable a) => (a -> Bool) -> Function - ICond :: forall a . (Columnable a) => (Int -> a -> Bool) -> Function + F1 :: forall a b . (Columnable' a, Columnable' b) => (a -> b) -> Function + F2 :: forall a b c . (Columnable' a, Columnable' b, Columnable' c) => (a -> b -> c) -> Function + F3 :: forall a b c d . (Columnable' a, Columnable' b, Columnable' c, Columnable' d) => (a -> b -> c -> d) -> Function + F4 :: forall a b c d e . (Columnable' a, Columnable' b, Columnable' c, Columnable' d, Columnable' e) => (a -> b -> c -> d -> e) -> Function + Cond :: forall a . (Columnable' a) => (a -> Bool) -> Function + ICond :: forall a . (Columnable' a) => (Int -> a -> Bool) -> Function -- Helper class to do the actual wrapping class WrapFunction a where wrapFunction :: a -> Function -- Instance for 1-argument functions -instance (Columnable a, Columnable b) => WrapFunction (a -> b) where - wrapFunction :: (Columnable a, Columnable b) => (a -> b) -> Function +instance (Columnable' a, Columnable' b) => WrapFunction (a -> b) where + wrapFunction :: (Columnable' a, Columnable' b) => (a -> b) -> Function wrapFunction = F1 -- Instance for 2-argument functions -instance {-# INCOHERENT #-} (Columnable a, Columnable b, Columnable c) => WrapFunction (a -> b -> c) where - wrapFunction :: (Columnable a, Columnable b, Columnable c) => (a -> b -> c) -> Function +instance {-# INCOHERENT #-} (Columnable' a, Columnable' b, Columnable' c) => WrapFunction (a -> b -> c) where + wrapFunction :: (Columnable' a, Columnable' b, Columnable' c) => (a -> b -> c) -> Function wrapFunction = F2 -- Instance for 3-argument functions -instance {-# INCOHERENT #-} (Columnable a, Columnable b, Columnable c, Columnable d) => WrapFunction (a -> b -> c -> d) where - wrapFunction :: (Columnable a, Columnable b, Columnable c, Columnable d) => (a -> b -> c -> d) -> Function +instance {-# INCOHERENT #-} (Columnable' a, Columnable' b, Columnable' c, Columnable' d) => WrapFunction (a -> b -> c -> d) where + wrapFunction :: (Columnable' a, Columnable' b, Columnable' c, Columnable' d) => (a -> b -> c -> d) -> Function wrapFunction = F3 -instance {-# INCOHERENT #-} (Columnable a, Columnable b, Columnable c, Columnable d, Columnable e) => WrapFunction (a -> b -> c -> d -> e) where - wrapFunction :: (Columnable a, Columnable b, Columnable c, Columnable d, Columnable e) => (a -> b -> c -> d -> e) -> Function +instance {-# INCOHERENT #-} (Columnable' a, Columnable' b, Columnable' c, Columnable' d, Columnable' e) => WrapFunction (a -> b -> c -> d -> e) where + wrapFunction :: (Columnable' a, Columnable' b, Columnable' c, Columnable' d, Columnable' e) => (a -> b -> c -> d -> e) -> Function wrapFunction = F4 -- The main function that wraps arbitrary functions @@ -65,7 +65,7 @@ uncons v = Just (V.unsafeHead v, V.unsafeTail v) pattern (:<|) :: a -> V.Vector a -> V.Vector a pattern x :<| xs <- (uncons -> Just (x, xs)) -funcApply :: forall c . (Columnable c) => V.Vector RowValue -> Function -> c +funcApply :: forall c . (Columnable' c) => V.Vector RowValue -> Function -> c funcApply Empty _ = error "Empty args" funcApply (Value (x :: a') :<| Empty) (F1 (f :: (a -> b))) = case testEquality (typeRep @a') (typeRep @a) of Just Refl -> case testEquality (typeOf (f x)) (typeRep @c) of diff --git a/src/DataFrame/Internal/Types.hs b/src/DataFrame/Internal/Types.hs index 99a11d1e..0623dff4 100644 --- a/src/DataFrame/Internal/Types.hs +++ b/src/DataFrame/Internal/Types.hs @@ -21,10 +21,10 @@ import Data.Type.Equality (TestEquality(..)) -- We need an "Object" type as an intermediate representation -- for rows. Useful for things like sorting and function application. -type Columnable a = (Typeable a, Show a, Ord a, Eq a) +type Columnable' a = (Typeable a, Show a, Ord a, Eq a) data RowValue where - Value :: (Columnable a) => a -> RowValue + Value :: (Columnable' a) => a -> RowValue instance Eq RowValue where (==) :: RowValue -> RowValue -> Bool @@ -42,13 +42,9 @@ instance Show RowValue where show :: RowValue -> String show (Value a) = show a -toRowValue :: forall a . (Columnable a) => a -> RowValue +toRowValue :: forall a . (Columnable' a) => a -> RowValue toRowValue = Value --- | Essentially a "functor" instance of our type-erased Column. -class Transformable a where - transform :: forall b c . (Columnable b, Columnable c) => (b -> c) -> a -> Maybe a - -- Convenience functions for types. unboxableTypes :: TypeRepList '[Int, Int8, Int16, Int32, Int64, Word, Word8, Word16, Word32, Word64, diff --git a/src/DataFrame/Operations/Aggregation.hs b/src/DataFrame/Operations/Aggregation.hs index 86d30295..7d441519 100644 --- a/src/DataFrame/Operations/Aggregation.hs +++ b/src/DataFrame/Operations/Aggregation.hs @@ -4,6 +4,7 @@ {-# LANGUAGE RankNTypes #-} {-# LANGUAGE ScopedTypeVariables #-} {-# LANGUAGE TypeApplications #-} +{-# LANGUAGE FlexibleContexts #-} module DataFrame.Operations.Aggregation where import qualified Data.Set as S @@ -22,7 +23,7 @@ import qualified Statistics.Sample as SS import Control.Exception (throw) import Control.Monad (foldM_) import Control.Monad.ST (runST) -import DataFrame.Internal.Column (Column(..), toColumn', getIndicesUnboxed, getIndices) +import DataFrame.Internal.Column (Column(..), toColumn', getIndicesUnboxed, getIndices, Columnable) import DataFrame.Internal.DataFrame (DataFrame(..), empty, getColumn) import DataFrame.Internal.Parsing import DataFrame.Internal.Types diff --git a/src/DataFrame/Operations/Core.hs b/src/DataFrame/Operations/Core.hs index ff3a88a6..cf08b602 100644 --- a/src/DataFrame/Operations/Core.hs +++ b/src/DataFrame/Operations/Core.hs @@ -19,10 +19,9 @@ import qualified Data.Vector.Unboxed as VU import Control.Exception ( throw ) import DataFrame.Errors -import DataFrame.Internal.Column ( Column(..), toColumn', toColumn, columnLength, columnTypeString, expandColumn ) +import DataFrame.Internal.Column ( Column(..), toColumn', toColumn, columnLength, columnTypeString, expandColumn, Columnable) import DataFrame.Internal.DataFrame (DataFrame(..), getColumn, null, empty) import DataFrame.Internal.Parsing (isNullish) -import DataFrame.Internal.Types (Columnable) import Data.Either import Data.Function (on, (&)) import Data.Maybe @@ -43,7 +42,7 @@ columnNames = map fst . L.sortBy (compare `on` snd). M.toList . columnIndices -- | /O(n)/ Adds a vector to the dataframe. insertColumn :: forall a. - (Columnable a) => + Columnable a => -- | Column Name T.Text -> -- | Vector to add to column diff --git a/src/DataFrame/Operations/Statistics.hs b/src/DataFrame/Operations/Statistics.hs index 1759e724..4804e960 100644 --- a/src/DataFrame/Operations/Statistics.hs +++ b/src/DataFrame/Operations/Statistics.hs @@ -5,6 +5,7 @@ {-# LANGUAGE GADTs #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE StrictData #-} +{-# LANGUAGE FlexibleContexts #-} module DataFrame.Operations.Statistics where import qualified Data.List as L @@ -21,7 +22,6 @@ import Control.Exception (throw) import DataFrame.Errors (DataFrameException(..)) import DataFrame.Internal.Column import DataFrame.Internal.DataFrame (DataFrame(..), getColumn, empty) -import DataFrame.Internal.Types (Columnable, transform) import DataFrame.Operations.Core import Data.Foldable (asum) import Data.Maybe (isJust, fromMaybe) diff --git a/src/DataFrame/Operations/Subset.hs b/src/DataFrame/Operations/Subset.hs index 849f6d3c..c8d32ca5 100644 --- a/src/DataFrame/Operations/Subset.hs +++ b/src/DataFrame/Operations/Subset.hs @@ -4,6 +4,7 @@ {-# LANGUAGE ScopedTypeVariables #-} {-# LANGUAGE TypeApplications #-} {-# LANGUAGE GADTs #-} +{-# LANGUAGE FlexibleContexts #-} module DataFrame.Operations.Subset where import qualified Data.List as L @@ -21,7 +22,7 @@ import DataFrame.Internal.Column import DataFrame.Internal.DataFrame (DataFrame(..), getColumn, empty) import DataFrame.Internal.Function import DataFrame.Internal.Row (mkRowFromArgs) -import DataFrame.Internal.Types (Columnable, RowValue, toRowValue) +import DataFrame.Internal.Types (RowValue, toRowValue) import DataFrame.Operations.Core import DataFrame.Operations.Transformations (apply) import Data.Function ((&)) diff --git a/src/DataFrame/Operations/Transformations.hs b/src/DataFrame/Operations/Transformations.hs index 82c27f48..12f55826 100644 --- a/src/DataFrame/Operations/Transformations.hs +++ b/src/DataFrame/Operations/Transformations.hs @@ -2,6 +2,7 @@ {-# LANGUAGE RankNTypes #-} {-# LANGUAGE ScopedTypeVariables #-} {-# LANGUAGE TypeApplications #-} +{-# LANGUAGE FlexibleContexts #-} module DataFrame.Operations.Transformations where import qualified Data.List as L @@ -13,11 +14,12 @@ import qualified Data.Vector.Unboxed as VU import Control.Exception (throw) import DataFrame.Errors (DataFrameException(..)) -import DataFrame.Internal.Column (Column(..), columnTypeString, itransform, ifoldrColumn) +import DataFrame.Internal.Column (Column(..), columnTypeString, itransform, ifoldrColumn, TypedColumn (TColumn), Columnable, transform, unwrapTypedColumn) import DataFrame.Internal.DataFrame (DataFrame(..), getColumn) +import DataFrame.Internal.Expression import DataFrame.Internal.Function (Function(..), funcApply) import DataFrame.Internal.Row (mkRowFromArgs) -import DataFrame.Internal.Types (Columnable, RowValue, toRowValue, transform) +import DataFrame.Internal.Types (RowValue, toRowValue) import DataFrame.Operations.Core import Data.Maybe import Type.Reflection (typeRep, typeOf) @@ -41,41 +43,11 @@ apply f columnName d = case getColumn columnName d of -- | O(k) Apply a function to a combination of columns in a dataframe and -- add the result into `alias` column. -deriveFrom :: ([T.Text], Function) -> T.Text -> DataFrame -> DataFrame -deriveFrom (args, f) name df = case f of - (F4 (f' :: a -> b -> c -> d -> e)) -> let - xs = VG.map (\row -> funcApply @e row f) $ V.generate (fst (dimensions df)) (mkRowFromArgs args df) - in insertColumn name xs df - (F3 (f' :: a -> b -> c -> d)) -> let - xs = VG.map (\row -> funcApply @d row f) $ V.generate (fst (dimensions df)) (mkRowFromArgs args df) - in insertColumn name xs df - (F2 (f' :: a -> b -> c)) -> let - xs = VG.map (\row -> funcApply @c row f) $ V.generate (fst (dimensions df)) (mkRowFromArgs args df) - in insertColumn name xs df - (F1 (f' :: a -> b)) -> let - xs = VG.map (\row -> funcApply @b row f) $ V.generate (fst (dimensions df)) (mkRowFromArgs args df) - in insertColumn name xs df +derive :: forall a . Columnable a => T.Text -> Expr a -> DataFrame -> DataFrame +derive name expr df = let + value = interpret @a df expr + in insertColumn' name (Just (unwrapTypedColumn value)) df --- | O(k) Apply a function to a given column in a dataframe and --- add the result into alias column. - -derive :: - forall b c. - (Columnable b, Columnable c) => - -- | New name - T.Text -> - -- | function to apply - (b -> c) -> - -- | Derivative column name - T.Text -> - -- | DataFrame to apply operation to - DataFrame -> - DataFrame -derive alias f columnName d = case getColumn columnName d of - Nothing -> throw $ ColumnNotFoundException columnName "derive" (map fst $ M.toList $ columnIndices d) - Just column -> case transform f column of - Nothing -> throw $ TypeMismatchException (typeOf column) (typeRep @b) columnName "derive" - Just res -> insertColumn' alias (Just res) d -- | O(k * n) Apply a function to given column names in a dataframe. applyMany :: diff --git a/tests/Operations/Derive.hs b/tests/Operations/Derive.hs index bb7fb209..d6a46f0a 100644 --- a/tests/Operations/Derive.hs +++ b/tests/Operations/Derive.hs @@ -23,13 +23,13 @@ values = [ ("test1", DI.toColumn ([1..26] :: [Int])) testData :: D.DataFrame testData = D.fromList values -deriveFromWAI :: Test -deriveFromWAI = TestCase (assertEqual "deriveFrom works when function args align" +deriveWAI :: Test +deriveWAI = TestCase (assertEqual "derive works with column expression" (Just $ DI.BoxedColumn (V.fromList (zipWith (\n c -> show n ++ [c]) [1..26] ['a'..'z']))) - (DI.getColumn "test4" $ D.deriveFrom ( - ["test1", "test3"], - D.func (\(n :: Int) (c :: Char) -> show n ++ [c])) "test4" testData)) + (DI.getColumn "test4" $ D.derive "test4" ( + D.lift2 (++) (D.lift show (D.col @Int "test1")) (D.lift (: ([] :: [Char])) (D.col @Char "test3")) + ) testData)) tests :: [Test] -tests = [ TestLabel "deriveFromWAI" deriveFromWAI +tests = [ TestLabel "deriveWAI" deriveWAI ] \ No newline at end of file