mem

package

v0.0.3 Latest Latest Go to latest Published: May 6, 2025 License: Apache-2.0 Imports: 18 Imported by: 0

Details

Valid go.mod file
Redistributable license
Tagged version
Stable version
Learn more about best practices

Repository

github.com/invertedv/df

Links

Open Source Insights

Documentation ¶

Index ¶

func StandardFunctions() d.Fns
type Col
- func NewCol(data any, opts ...d.ColOpt) (*Col, error)
type DF

Constants ¶

This section is empty.

Variables ¶

This section is empty.

Functions ¶

func StandardFunctions ¶

func StandardFunctions() d.Fns

StandardFunctions returns the built-in functions for in-memory data to be used by Parser.

Types ¶

type Col ¶

type Col struct {
	*d.Vector

	*d.ColCore
}

Col implements Column for in-memory data.

func NewCol ¶

func NewCol(data any, opts ...d.ColOpt) (*Col, error)

NewCol creates a new *mem.Col from data.

Example ¶

Create a new column from a *Vector.

const n = 100

x := make([]int64, n)
for ind := range n {
	x[ind] = int64(ind * 2)
}

var (
	v  *d.Vector
	e1 error
)
// NewVector will convert the type.
if v, e1 = d.NewVector(x, d.DTint); e1 != nil {
	panic(e1)
}

// Note, calling NewCol with x will generate an error since x is not of type int
// and NewCol does not convert types.
var (
	col *Col
	e2  error
)
if col, e2 = NewCol(v, d.ColName("x")); e2 != nil {
	panic(e2)
}

fmt.Println(col.AsAny().([]int)[0:10])

Output:

[0 2 4 6 8 10 12 14 16 18]

func (*Col) AllRows ¶

func (c *Col) AllRows() iter.Seq2[int, []any]

func (*Col) Copy ¶

func (c *Col) Copy() d.Column

func (*Col) Data ¶

func (c *Col) Data() *d.Vector

func (*Col) String ¶

func (c *Col) String() string

type DF ¶

type DF struct {
	*d.DFcore
	// contains filtered or unexported fields
}

DF implements d.DF for in-memory data.

func DBload ¶

func DBload(qry string, dlct *d.Dialect, opts ...d.DFopt) (*DF, error)

DBload loads a *DF from a query.

Example ¶

Connect to ClickHouse and pull the data from a query. Note that this code is identical to the DBload example in df/sql. The df/mem package loads the data into memory; the df/sql package does not.

const (
	dbProvider = "clickhouse"
	chTable    = "testing.d1"
)

// ClickHouse connection parameters.
user := os.Getenv("user")
host := os.Getenv("host")
password := os.Getenv("password")
db := newConnectCH(host, user, password)

qry := "SELECT k, x FROM " + chTable

var (
	dlct *d.Dialect
	e    error
)
if dlct, e = d.NewDialect(dbProvider, db); e != nil {
	panic(e)
}

var (
	df *DF
	e1 error
)
if df, e1 = DBload(qry, dlct); e1 != nil {
	panic(e1)
}

fmt.Println("# of Rows: ", df.RowCount())
fmt.Println("Columns: ", df.ColumnNames())

Output:

# of Rows:  6
Columns:  [k x]

func FileLoad ¶

func FileLoad(f *d.Files, opts ...d.DFopt) (*DF, error)

FileLoad loads a *DF from a *d.Files struct.

Example ¶

Load a CSV with a header. Column types are determined by peeking at the data.

var (
	f  *d.Files
	e1 error
)
if f, e1 = d.NewFiles(d.FileStrict(true)); e1 != nil {
	panic(e1)
}

// this file is in df/data.
fileToOpen := os.Getenv("datapath") + "d1.csv"
if ex := f.Open(fileToOpen); ex != nil {
	panic(ex)
}

var (
	df *DF
	e2 error
)
if df, e2 = FileLoad(f); e2 != nil {
	panic(e2)
}

fmt.Println("# of Rows: ", df.RowCount())
fmt.Println("Columns: ", df.ColumnNames())

Output:

# of Rows:  6
Columns:  [k x y yy z dt R]

Example (Types) ¶

Load a CSV with a header. Column names & types are specified by user. The source .CSV has a header, which is skipped. Note, if you specify types, you must also specify names.

// ordered as in the file
fieldNames := []string{"k", "x", "y", "yy", "z", "dt", "RNew"}
fieldTypes := []d.DataTypes{d.DTint, d.DTfloat, d.DTint, d.DTint, d.DTstring, d.DTdate, d.DTfloat}

var (
	f  *d.Files
	e1 error
)
if f, e1 = d.NewFiles(d.FileFieldNames(fieldNames), d.FileFieldTypes(fieldTypes)); e1 != nil {
	panic(e1)
}

fileToOpen := os.Getenv("datapath") + "d1.csv"
if ex := f.Open(fileToOpen); ex != nil {
	panic(ex)
}

var (
	df *DF
	e2 error
)
if df, e2 = FileLoad(f); e2 != nil {
	panic(e2)
}

ct, _ := df.ColumnTypes()
fmt.Println(ct)

Output:

[DTint DTfloat DTint DTint DTstring DTdate DTfloat]

func NewDF ¶

func NewDF(input any, opts ...d.DFopt) (*DF, error)

NewDF creates a *DF from input.

input - can be (in order of what's tried)
  - *DF. This is copied.
  - *Col. This is copied.
  - d.Column.
  - *Vector.
  - HasMQdlct. The query is run to fetch the data.
  - d.DF. The data is pulled to construct the output.

func NewDFcol ¶

func NewDFcol(cols []*Col, opts ...d.DFopt) (*DF, error)

NewDFcol creates a *DF from a slice of *mem.Col.

Example ¶

Create columns from slices and then create a new dataframe from them

const n = 100

x := make([]int, n)
y := make([]float64, n)
for ind := range n {
	x[ind] = ind * 2
	y[ind] = float64(x[ind])
}

var (
	col1, col2 *Col
	e1         error
)

if col1, e1 = NewCol(x, d.ColName("x")); e1 != nil {
	panic(e1)
}

if col2, e1 = NewCol(x, d.ColName("y")); e1 != nil {
	panic(e1)
}

var (
	df *DF
	e2 error
)
if df, e2 = NewDFcol([]*Col{col1, col2}); e2 != nil {
	panic(e2)
}

var (
	xf []float64
	e3 error
)
// This will convert x to a float64.
if xf, e3 = df.Column("x").Data().AsFloat(); e3 != nil {
	panic(e3)
}

fmt.Println(xf[0:10])

Output:

[0 2 4 6 8 10 12 14 16 18]

func NewDFseq ¶

func NewDFseq(n int, name string, opts ...d.DFopt) (*DF, error)

NewDFseq creates a *DF with a single column, name. That column is a DTint sequence from 0 to n-1.

func (*DF) AllRows ¶

func (f *DF) AllRows() iter.Seq2[int, []any]

AllRows iterates through the rows of f. It returns the row # and the values of f for that row.

func (*DF) AppendColumn ¶

func (f *DF) AppendColumn(col d.Column, replace bool) error

AppendColumn masks the DFcore version so that we can handle appending scalars

Example ¶

Append a column to a dataframe

const (
	n    = 100
	slen = 4
)

var (
	df *DF
	e1 error
)
if df, e1 = NewDFseq(n, "seq"); e1 != nil {
	panic(e1)
}

x := make([]string, n)
for ind := range n {
	x[ind] = d.RandomLetters(4)
}

// create a column named "x" from x.
var (
	col *Col
	e2  error
)
if col, e2 = NewCol(x, d.ColName("x")); e2 != nil {
	panic(e2)
}

if e := df.AppendColumn(col, false); e != nil {
	panic(e)
}

fmt.Println(df.ColumnNames())

Output:

[seq x]

func (*DF) AppendDF ¶

func (f *DF) AppendDF(df d.DF) (d.DF, error)

func (*DF) By ¶

func (f *DF) By(groupBy string, fns ...string) (d.DF, error)

By creates a new *DF with function fns calculated within the groups defined by groupBy.

groupBy - comma-separated list of fields to group on.  If groupBy is empty, then the output will have 1 row.
fns     - functions to calculate on the By groups.

Example ¶

Create a new table grouping on one column with two summary columns.

const n = 1000

// create source dataframe.
x := make([]int, n)
y := make([]float64, n)
for ind := range n {
	x[ind] = ind % 4
	y[ind] = float64(ind)
}

var (
	cx, cy *Col
	e0     error
)
if cx, e0 = NewCol(x, d.ColName("x")); e0 != nil {
	panic(e0)
}
if cy, e0 = NewCol(y, d.ColName("y")); e0 != nil {
	panic(e0)
}

var (
	df *DF
	e1 error
)
if df, e1 = NewDFcol([]*Col{cx, cy}); e1 != nil {
	panic(e1)
}
var (
	dfBy d.DF
	e2   error
)

// produce a summary
if dfBy, e2 = df.By("x", "my := mean(y)", "sy := sum(y)"); e2 != nil {
	panic(e2)
}

if e := dfBy.Sort(true, "x"); e != nil {
	panic(e)
}

fmt.Println(dfBy.Column("x").Data().AsAny())
fmt.Println(dfBy.Column("my").Data().AsAny())
fmt.Println(dfBy.Column("sy").Data().AsAny())

Output:

[0 1 2 3]
[498 499 500 501]
[124500 124750 125000 125250]

Example (Global) ¶

Create a summary table that requires a global summary in the calculation.

const n = 1000

// create source dataframe.
x := make([]int, n)
y := make([]float64, n)
for ind := range n {
	x[ind] = ind % 4
	y[ind] = float64(ind)
}

var (
	cx, cy *Col
	e0     error
)
if cx, e0 = NewCol(x, d.ColName("x")); e0 != nil {
	panic(e0)
}
if cy, e0 = NewCol(y, d.ColName("y")); e0 != nil {
	panic(e0)
}

var (
	df *DF
	e1 error
)
if df, e1 = NewDFcol([]*Col{cx, cy}); e1 != nil {
	panic(e1)
}
var (
	dfBy d.DF
	e2   error
)
// produce a summary
if dfBy, e2 = df.By("x", "cnt := count(x)", "total := count(global(x))", "prop := 100.0 * float(cnt)/float(total)"); e2 != nil {
	panic(e2)
}
//	if dfBy, e2 = df.By("x", "cnt := count(x)", "prop := float(cnt)/float(count(global(x)))"); e2 != nil {
//		panic(e2)
//	}

if e := dfBy.Sort(true, "x"); e != nil {
	panic(e)
}

fmt.Println(dfBy.Column("x").Data().AsAny())
fmt.Println(dfBy.Column("cnt").Data().AsAny())
fmt.Println(dfBy.Column("total").Data().AsAny())
fmt.Println(dfBy.Column("prop").Data().AsAny())
//

Output:

[0 1 2 3]
[250 250 250 250]
[1000 1000 1000 1000]
[25 25 25 25]

Example (OneRow) ¶

Create a summary with no grouping column.

const n = 1000

// create source dataframe.
x := make([]int, n)
y := make([]float64, n)
for ind := range n {
	x[ind] = ind % 4
	y[ind] = float64(ind)
}

var (
	cx, cy *Col
	e0     error
)
if cx, e0 = NewCol(x, d.ColName("x")); e0 != nil {
	panic(e0)
}
if cy, e0 = NewCol(y, d.ColName("y")); e0 != nil {
	panic(e0)
}

var (
	df *DF
	e1 error
)
if df, e1 = NewDFcol([]*Col{cx, cy}); e1 != nil {
	panic(e1)
}
var (
	dfBy d.DF
	e2   error
)
// produce a summary
if dfBy, e2 = df.By("", "cnt := count(y)", "sy := sum(y)"); e2 != nil {
	panic(e2)
}

fmt.Println(dfBy.Column("cnt").Data().AsAny())
fmt.Println(dfBy.Column("sy").Data().AsAny())

Output:

[1000]
[499500]

Example (TwoColumns) ¶

Create a new table grouping on two columns with two summary columns.

const n = 1000

// create source dataframe.
x := make([]int, n)
r := make([]int, n)
y := make([]float64, n)
for ind := range n {
	x[ind] = ind % 4
	r[ind] = ind % 8
	y[ind] = float64(ind)
}

var (
	cx, cr, cy *Col
	e0         error
)
if cx, e0 = NewCol(x, d.ColName("x")); e0 != nil {
	panic(e0)
}
if cr, e0 = NewCol(r, d.ColName("r")); e0 != nil {
	panic(e0)
}
if cy, e0 = NewCol(y, d.ColName("y")); e0 != nil {
	panic(e0)
}

var (
	df *DF
	e1 error
)
if df, e1 = NewDFcol([]*Col{cx, cr, cy}); e1 != nil {
	panic(e1)
}
var (
	dfBy d.DF
	e2   error
)

// produce a summary
if dfBy, e2 = df.By("x,r", "my := mean(y)", "sy := sum(y)"); e2 != nil {
	panic(e2)
}

if e := dfBy.Sort(true, "x,r"); e != nil {
	panic(e)
}

fmt.Println(dfBy.Column("x").Data().AsAny())
fmt.Println(dfBy.Column("r").Data().AsAny())
fmt.Println(dfBy.Column("my").Data().AsAny())
fmt.Println(dfBy.Column("sy").Data().AsAny())

Output:

[0 0 1 1 2 2 3 3]
[0 4 1 5 2 6 3 7]
[496 500 497 501 498 502 499 503]
[62000 62500 62125 62625 62250 62750 62375 62875]

func (*DF) Categorical ¶

func (f *DF) Categorical(colName string, catMap d.CategoryMap, fuzz int, defaultVal any, levels []any) (d.Column, error)

Categorical produces a categorical column from a source column.

colName    - name of the source column
catMap     - optionally supply a category map of source value -> category level
fuzz       - if a source column value has counts < fuzz, then it is put in the 'other' category.
defaultVal - optional source column value for the 'other' category.
levels     - slice of source values to make categories from

func (*DF) Copy ¶

func (f *DF) Copy() d.DF

func (*DF) Interp ¶

func (f *DF) Interp(points d.HasIter, xSfield, xIfield, yfield, outField string) (d.DF, error)

Interp interpolates the columns (xSfield, yfield) of f at the xIfield values supplied by points.

points   - input iterator (e.g. Column or DF) that yields the points to interpolate at
xSfield  - column name of x values in source DF
xIfield  - column name of x values in points
yfield   - column name of y values in source DF
outField - column name of interpolated y's in return DF

The output DF is restricted to interpolated points that lie within the data. It has columns:

xIfield  - points at which to interpolate. This may be a subset of the input "points".
outField - interpolated values.

Example ¶

const n1 = 10

// create first dataframe.
x := make([]float64, n1)
y := make([]float64, n1)
for ind := range n1 {
	x[ind] = float64(ind)
	y[ind] = float64(ind) * 4
}

var (
	cx1, cy1 *Col
	e0       error
)
if cx1, e0 = NewCol(x, d.ColName("x")); e0 != nil {
	panic(e0)
}
if cy1, e0 = NewCol(y, d.ColName("y")); e0 != nil {
	panic(e0)
}

var (
	df1 *DF
	e1  error
)
if df1, e1 = NewDFcol([]*Col{cx1, cy1}); e1 != nil {
	panic(e1)
}

cxi := []float64{0.5, 4.25, -1, 20, 6.8}
coli, _ := NewCol(cxi, d.ColName("xi"))

dfOut, _ := df1.Interp(coli, "x", "xi", "y", "yInterp")
fmt.Println(dfOut.Column("yInterp").Data().AsAny())

Output:

[2 17 27.2]

func (*DF) Join ¶

func (f *DF) Join(df d.HasIter, joinOn string) (d.DF, error)

Join joins f and df on the columns of joinOn. This is an inner join.

df - data to join.
joinOn - comma-separated list of fields to join on.  These fields must have the same name in both data sets.

Example ¶

const (
	n1 = 10
	n2 = 15
)

// create first dataframe.
x := make([]int, n1)
y := make([]float64, n1)
for ind := range n1 {
	x[ind] = ind
	y[ind] = float64(ind) * 4
}

var (
	cx1, cy1 *Col
	e0       error
)
if cx1, e0 = NewCol(x, d.ColName("x")); e0 != nil {
	panic(e0)
}
if cy1, e0 = NewCol(y, d.ColName("y")); e0 != nil {
	panic(e0)
}

var (
	df1 *DF
	e1  error
)
if df1, e1 = NewDFcol([]*Col{cx1, cy1}); e1 != nil {
	panic(e1)
}

// create second dataframe.
x = make([]int, n2)
z := make([]float64, n2)
for ind := range n2 {
	x[ind] = ind
	z[ind] = -float64(ind) * 4
}

var (
	cx2, cz2 *Col
	e2       error
)
if cx2, e2 = NewCol(x, d.ColName("x")); e2 != nil {
	panic(e2)
}
if cz2, e2 = NewCol(z, d.ColName("z")); e2 != nil {
	panic(e2)
}

var (
	df2 *DF
	e3  error
)
if df2, e3 = NewDFcol([]*Col{cx2, cz2}); e3 != nil {
	panic(e3)
}

var (
	dfJoin d.DF
	e4     error
)
if dfJoin, e4 = df1.Join(df2, "x"); e4 != nil {
	panic(e4)
}
fmt.Println(dfJoin.Column("x").Data().AsAny())
fmt.Println(dfJoin.Column("y").Data().AsAny())
fmt.Println(dfJoin.Column("z").Data().AsAny())

Output:

[0 1 2 3 4 5 6 7 8 9]
[0 4 8 12 16 20 24 28 32 36]
[-0 -4 -8 -12 -16 -20 -24 -28 -32 -36]

Example (TwoColumns) ¶

Join based on two columns. Compare to the same example under df/sql.

const (
	nLeft      = 10
	nRight     = 15
	dbProvider = "clickhouse"
)

var (
	dfLeft, dfRight d.DF
	e1              error
)
if dfLeft, e1 = NewDFseq(nLeft, "seq"); e1 != nil {
	panic(e1)
}

if dfRight, e1 = NewDFseq(nRight, "seq"); e1 != nil {
	panic(e1)
}

// second column to join on
if e := d.Parse(dfLeft, "b := if(mod(seq,4) == 0, 'a', if(mod(seq,4)==1, 'b', if(mod(seq,4)==2, 'c', 'd')))"); e != nil {
	panic(e)
}

if e := d.Parse(dfRight, "b := if(mod(seq,4) == 0, 'a', 'b')"); e != nil {
	panic(e)
}

// add another column to each
if e := d.Parse(dfLeft, "x := exp(float(seq) / 100.0)"); e != nil {
	panic(e)
}

if e := d.Parse(dfRight, "y := seq^2"); e != nil {
	panic(e)
}

var (
	dfJoin d.DF
	e2     error
)

if dfJoin, e2 = dfLeft.Join(dfRight, "seq,b"); e2 != nil {
	panic(e2)
}

fmt.Println(dfJoin.RowCount())
fmt.Println(dfJoin.Column("seq").Data().AsAny())
fmt.Println(dfJoin.Column("b").Data().AsAny())
fmt.Println(dfJoin.Column("y").Data().AsAny())

Output:

6
[0 1 4 5 8 9]
[a b a b a b]
[0 1 16 25 64 81]

func (*DF) Len ¶

func (f *DF) Len() int

Len is required for sort

func (*DF) Less ¶

func (f *DF) Less(i, j int) bool

Less returns true if row i < row j when sorting by the orderBy field of f

func (*DF) Row ¶

func (f *DF) Row(rowNum int) []any

Row returns the rowNum row of f

func (*DF) RowCount ¶

func (f *DF) RowCount() int

RowCount returns # of rows in f

func (*DF) SetParent ¶

func (f *DF) SetParent() error

SetParent sets the parent to f for all the columns in f.

func (*DF) Sort ¶

func (f *DF) Sort(ascending bool, sortCols string) error

Sort sorts f according to sortCols.

ascending - true = sort ascending
sortCols  - comma-separated list of columns to sort on.

func (*DF) SourceQuery ¶

func (f *DF) SourceQuery() string

SourceQuery returns the query used to load f, if any.

func (*DF) String ¶

func (f *DF) String() string

String produces a summary of f.

func (*DF) Swap ¶

func (f *DF) Swap(i, j int)

Swap swaps rows i and j.

func (*DF) Table ¶

func (f *DF) Table(cols string) (d.DF, error)

Table produces a table based on cols. cols is a comma-separated list of fields. The metrics calculated within each group are:

n    - count of rows
rate - fraction of original row count.

func (*DF) Where ¶

func (f *DF) Where(condition string) (d.DF, error)

Where subsets f to rows where condition is true.

Example ¶

const n1 = 10

// create dataframe.
x := make([]int, n1)
y := make([]float64, n1)
for ind := range n1 {
	x[ind] = ind
	y[ind] = float64(ind) * 4
}

var (
	cx1, cy1 *Col
	e0       error
)
if cx1, e0 = NewCol(x, d.ColName("x")); e0 != nil {
	panic(e0)
}
if cy1, e0 = NewCol(y, d.ColName("y")); e0 != nil {
	panic(e0)
}

var (
	df1 *DF
	e1  error
)
if df1, e1 = NewDFcol([]*Col{cx1, cy1}); e1 != nil {
	panic(e1)
}

// subset to where x < 4 or x > 8
dfOut, _ := df1.Where("x < 4 || x > 8")
fmt.Println(dfOut.Column("x").Data().AsAny())

Output:

[0 1 2 3 9]

Source Files ¶

View all Source files

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL