diff --git a/apophenia/README.md b/apophenia/README.md deleted file mode 100644 index 51ef4ec..0000000 --- a/apophenia/README.md +++ /dev/null @@ -1,92 +0,0 @@ -# Apophenia -- seeking patterns in randomness - -Apophenia provides an approximate emulation of a seekable pseudo-random -number generator. You provide a seed, and get a generator which can generate -a large number of pseudo-random bits which will occur in a predictable -pattern, but you can seek anywhere in that pattern in constant time. - -Apophenia's interface is intended to be similar to that of stdlib's -`math/rand`. In fact, the `Sequence` interface type is a strict superset -of `rand.Source`. - -## Implementation Notes - -AES-128, for a given seed, consists of a PRNG which, from 2^128 input -coordinates, produces 2^128 possible outputs, each of which is 128 bits -long. This is equivalent to a single-bit PRNG of period 2^135. It is -also possible to treat the input 128 bits as a coordinate system of -some sort, to allow multiple parallel sequences, etcetera. - -This design may have serious fundamental flaws, but it worked out in -light testing and I'm an optimist. - -### Sequences and Offsets - -Apophenia's underlying implementation admits 128-bit keys, and 128-bit -offsets within each sequence. In most cases: - - * That's more space than we need. - * Working with a non-native type for item numbers is annoying, - but 64-bits is enough range. - * It would be nice to avoid using the *same* pseudo-random values - for different things. - * Even when those things have the same basic identifying ID or - value. - -For instance, say you wish to generate a billion items. Each item should -have several "random" values. Some values might follow a Zipf distribution, -others might just be "U % N" for some N. If you use the item number as a -key, and seek to the same position for each of these, and get the same bits -for each of these, you may get unintended similarities or correlations between -them. - -With this in mind, apophenia divides its 128-bit offset space into a number -of spaces. The most significant bits are used for a sequence-type value, one -of: - - * SequenceDefault - * SequencePermutationK/SequencePermutationF: permutations - * SequenceWeighted: weighted bits - * SequenceLinear: linear values within a range - * SequenceZipfU: uniforms to use for Zipf values - * SequenceRandSource: default offsets for the rand.Source - * SequenceUser1/SequenceUser2: reserved for non-apophenia usage - -Other values are not yet defined, but are reserved. - -Within most of these spaces, the rest of the high word of the offset is used -for a 'seed' (used to select different sequences) and an 'iteration' (used -for successive values consumed by an algorithm). The low-order word is treated -as a 64-bit item ID. - -``` -High-order word: -0 1 2 3 -0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef -[iteration ][seed ][seq ] -Low-order word: -0 1 2 3 -0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef -[id ] -``` - -The convenience function `OffsetFor(sequence, seed, iteration, id)` -supports this usage. - -As a side effect, if generating additional values for a given seed and -id, you can increment the high-order word of the `Uint128`, -and if generating values for a new id, you can increment the low-order -word. If your algorithm consumes more than 2^24 values for a single -operation, you could start hitting values shared with other seeds. Oh, -well. 
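For illustration, here is a minimal sketch of this addressing scheme in use. It assumes the package is imported from the `github.com/molecula/apophenia` path used elsewhere in this change; the seed, sub-seed, and id values are arbitrary.

```
package main

import (
	"fmt"

	"github.com/molecula/apophenia"
)

func main() {
	seq := apophenia.NewSequence(42)

	// Bits for item 12345 in the weighted space, sub-seed 3, iteration 0.
	off := apophenia.OffsetFor(apophenia.SequenceWeighted, 3, 0, 12345)
	fmt.Println(seq.BitsAt(off))

	// More values for the same seed and id: increment the high-order word.
	off.Hi++
	fmt.Println(seq.BitsAt(off))

	// Values for the next id: increment the low-order word.
	off.Lo++
	fmt.Println(seq.BitsAt(off))
}
```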
- -#### Iteration usage - -For the built-in consumers: - - * Weighted consumes log2(scale) iterated values. - * Zipf consumes an *average* of no more than about 1.1 values. - * Permutation consumes one iterated value per 128 rounds of permutation, - where rounds is equal to `6*ceil(log2(max))`. (For instance, a second - value is consumed around a maximum of 2^22, and a third around 2^43.) - * Nothing else uses more than one iterated value. diff --git a/apophenia/apophenia.go b/apophenia/apophenia.go deleted file mode 100644 index 0749510..0000000 --- a/apophenia/apophenia.go +++ /dev/null @@ -1,117 +0,0 @@ -// Package apophenia provides seekable pseudo-random numbers, allowing -// reproducibility of pseudo-random results regardless of the order they're -// generated in. -package apophenia - -import ( - "crypto/aes" - "crypto/cipher" - "encoding/binary" - "math/rand" -) - -// Sequence represents a specific deterministic but pseudo-random-ish series -// of bits. A Sequence can be used as a `rand.Source` or `rand.Source64` -// for `math/rand`. -type Sequence interface { - rand.Source - Seek(Uint128) Uint128 - BitsAt(Uint128) Uint128 -} - -// aesSequence128 implements Sequence on top of an AES block cipher. -type aesSequence128 struct { - key [16]byte - cipher cipher.Block - plainText, cipherText [16]byte - offset Uint128 - err error -} - -// NewSequence generates a sequence initialized with the given seed. -func NewSequence(seed int64) Sequence { - s := aesSequence128{offset: OffsetFor(SequenceRandSource, 0, 0, 0)} - s.Seed(seed) - if s.err != nil { - panic("impossible error: " + s.err.Error()) - } - return &s -} - -// Seed sets the generator to a known state. -func (s *aesSequence128) Seed(seed int64) { - var newKey [16]byte - binary.LittleEndian.PutUint64(newKey[:8], uint64(seed)) - newCipher, err := aes.NewCipher(newKey[:]) - if err != nil { - // we can't return an error, because Seed() can't fail. also - // note that this can't actually happen, supposedly. - s.err = err - return - } - copy(s.key[:], newKey[:]) - s.cipher = newCipher - s.offset = Uint128{0, 0} -} - -// Int63 returns a value in 0..(1<<63)-1. -func (s *aesSequence128) Int63() int64 { - return int64(s.Uint64() >> 1) -} - -// Uint64 returns a value in 0..(1<<64)-1. -func (s *aesSequence128) Uint64() uint64 { - out := s.BitsAt(s.offset) - s.offset.Inc() - return out.Lo -} - -// SequenceClass denotes one of the sequence types, which are used to allow -// sequences to avoid hitting each other's pseudo-random results. -type SequenceClass uint8 - -const ( - // SequenceDefault is the zero value, used if you didn't think to pick one. - SequenceDefault SequenceClass = iota - // SequencePermutationK is the K values for the permutation algorithm. - SequencePermutationK - // SequencePermutationF is the F values for the permutation algorithm. - SequencePermutationF - // SequenceWeighted is used to generate weighted values for a given - // position. - SequenceWeighted - // SequenceLinear is the random numbers for U%N type usage. - SequenceLinear - // SequenceZipfU is the random numbers for the Zipf computations. - SequenceZipfU - // SequenceRandSource is used by default when a Sequence is being - // used as a rand.Source. - SequenceRandSource - // SequenceUser1 is eserved for non-apophenia package usage. - SequenceUser1 - // SequenceUser2 is reserved for non-apophenia package usage. - SequenceUser2 -) - -// OffsetFor determines the Uint128 offset for a given class/seed/iteration/id. 
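// For example, with arbitrary values: OffsetFor(SequenceWeighted, 3, 7, 12345)
// packs SequenceWeighted into the top 8 bits of Hi, the seed 3 into bits 24-55
// of Hi, the iteration 7 into the low 24 bits of Hi (larger iterations can
// spill into the seed bits, as noted in the README), and the id 12345 into Lo.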
-func OffsetFor(class SequenceClass, seed uint32, iter uint32, id uint64) Uint128 { - return Uint128{Hi: (uint64(class) << 56) | (uint64(seed) << 24) | uint64(iter), - Lo: id} -} - -// Seek seeks to the specified offset, yielding the previous offset. This -// sets the stream to a specific point in its cycle, affecting future calls -// to Int63 or Uint64. -func (s *aesSequence128) Seek(offset Uint128) (old Uint128) { - old, s.offset = s.offset, offset - return old -} - -// BitsAt yields the sequence of bits at the provided offset into the stream. -func (s *aesSequence128) BitsAt(offset Uint128) (out Uint128) { - binary.LittleEndian.PutUint64(s.plainText[:8], offset.Lo) - binary.LittleEndian.PutUint64(s.plainText[8:], offset.Hi) - s.cipher.Encrypt(s.cipherText[:], s.plainText[:]) - out.Lo, out.Hi = binary.LittleEndian.Uint64(s.cipherText[:8]), binary.LittleEndian.Uint64(s.cipherText[8:]) - return out -} diff --git a/apophenia/int128.go b/apophenia/int128.go deleted file mode 100644 index db3e146..0000000 --- a/apophenia/int128.go +++ /dev/null @@ -1,181 +0,0 @@ -package apophenia - -import "fmt" - -// Uint128 is an array of 2 uint64, treated as a single -// object to simplify calling conventions. -type Uint128 struct { - Lo, Hi uint64 // low-order and high-order uint64 words. Value is ``(Hi << 64) | Lo`. -} - -// Add adds value to its receiver in place. -func (u *Uint128) Add(value Uint128) { - u.Lo += value.Lo - if u.Lo < value.Lo { - u.Hi++ - } - u.Hi += value.Hi -} - -// Sub subtracts value from its receiver in place. -func (u *Uint128) Sub(value Uint128) { - u.Lo -= value.Lo - if u.Lo > value.Lo { - u.Hi-- - } - u.Hi -= value.Hi -} - -// And does a bitwise and with value, in place. -func (u *Uint128) And(value Uint128) { - u.Lo, u.Hi = u.Lo&value.Lo, u.Hi&value.Hi -} - -// Or does a bitwise or with value, in place. -func (u *Uint128) Or(value Uint128) { - u.Lo, u.Hi = u.Lo|value.Lo, u.Hi|value.Hi -} - -// Xor does a bitwise xor with value, in place. -func (u *Uint128) Xor(value Uint128) { - u.Lo, u.Hi = u.Lo^value.Lo, u.Hi^value.Hi -} - -// Not does a bitwise complement in place. -func (u *Uint128) Not() { - u.Lo, u.Hi = ^u.Lo, ^u.Hi -} - -// Mask produces a mask of the lower n bits of u. -func (u *Uint128) Mask(n uint64) { - if n >= 128 { - return - } - if n >= 64 { - u.Hi &= (1 << (n & 63)) - 1 - } else { - u.Lo &= (1 << n) - 1 - } -} - -// Mask produces a bitmask with n bits set. -func Mask(n uint64) (u Uint128) { - if n >= 128 { - u.Not() - return u - } - if n >= 64 { - u.Lo-- - u.Hi = (1 << n & 63) - 1 - return u - } - u.Lo = (1 << n) - 1 - return u -} - -// String provides a string representation. -func (u Uint128) String() string { - return fmt.Sprintf("0x%x%016x", u.Hi, u.Lo) -} - -// bit rotation: for 1-63 bits, we are moving the low-order N bits of u.Lo -// into the high-order N bits of u.Hi, and vice versa. For 64-127, it's that -// plus swapping u.Lo and u.Hi. - -// RotateRight rotates u right by n bits. -func (u *Uint128) RotateRight(n uint64) { - if n&64 != 0 { - u.Lo, u.Hi = u.Hi, u.Lo - } - n &= 63 - if n == 0 { - return - } - unbits := 64 - n - - u.Lo, u.Hi = (u.Lo>>n)|(u.Hi<>n)|(u.Lo<>unbits), (u.Hi<>unbits) -} - -// ShiftRight shifts u right by n bits. -func (u *Uint128) ShiftRight(n uint64) { - if n > 127 { - u.Lo, u.Hi = 0, 0 - return - } - if n >= 64 { - u.Lo, u.Hi = u.Hi>>(n&63), 0 - return - } - unbits := 64 - n - - u.Lo, u.Hi = (u.Lo>>n)|(u.Hi<> n) -} - -// ShiftRightCarry returns both the shifted value and the bits that -// were shifted out. 
Useful for when you want both x%N and x/N for -// N a power of 2. Only sane if bits <= 64. -func (u *Uint128) ShiftRightCarry(n uint64) (out Uint128, carry uint64) { - if n > 64 { - return out, carry - } - if n == 64 { - out.Lo, carry = u.Hi, u.Lo - return out, carry - } - unbits := 64 - n - - out.Lo, out.Hi, carry = (u.Lo>>n)|(u.Hi<> n), u.Lo&((1< 127 { - u.Lo, u.Hi = 0, 0 - return - } - if n >= 64 { - u.Lo, u.Hi = 0, u.Lo<<(n&63) - return - } - n &= 63 - if n == 0 { - return - } - unbits := 64 - n - - u.Lo, u.Hi = (u.Lo << n), (u.Hi<>unbits) -} - -// Bit returns 1 if the nth bit is set, 0 otherwise. -func (u *Uint128) Bit(n uint64) uint64 { - if n >= 128 { - return 0 - } - if n >= 64 { - return (u.Hi >> (n & 63)) & 1 - } - return (u.Lo >> n) & 1 -} - -// Inc increments its receiver in place. -func (u *Uint128) Inc() { - u.Lo++ - if u.Lo == 0 { - u.Hi++ - } -} diff --git a/apophenia/int128_test.go b/apophenia/int128_test.go deleted file mode 100644 index 411c4a9..0000000 --- a/apophenia/int128_test.go +++ /dev/null @@ -1,58 +0,0 @@ -package apophenia - -import ( - "testing" -) - -func Test_Int128Rotate(t *testing.T) { - cases := []struct { - in Uint128 - bits uint64 - outL, outR Uint128 - }{ - {in: Uint128{Lo: 0x1}, bits: 1, outR: Uint128{Lo: 0x0, Hi: 1 << 63}, outL: Uint128{Lo: 0x2, Hi: 0}}, - {in: Uint128{Lo: 0x11}, bits: 4, outR: Uint128{Lo: 1, Hi: 1 << 60}, outL: Uint128{Lo: 0x110, Hi: 0}}, - {in: Uint128{Lo: 0x11}, bits: 65, outR: Uint128{Lo: 1 << 63, Hi: 8}, outL: Uint128{Lo: 0, Hi: 0x22}}, - } - for _, c := range cases { - u := c.in - u.RotateRight(c.bits) - if u != c.outR { - t.Fatalf("rotate %s right by %d: expected %s, got %s", - c.in, c.bits, c.outR, u) - } - u = c.in - u.RotateLeft(c.bits) - if u != c.outL { - t.Fatalf("rotate %s left by %d: expected %s, got %s", - c.in, c.bits, c.outL, u) - } - } -} - -func Test_Int128Shift(t *testing.T) { - cases := []struct { - in Uint128 - bits uint64 - outL, outR Uint128 - }{ - {in: Uint128{Lo: 0x1}, bits: 1, outR: Uint128{Lo: 0x0, Hi: 0}, outL: Uint128{Lo: 0x2, Hi: 0}}, - {in: Uint128{Lo: 0x11}, bits: 4, outR: Uint128{Lo: 1, Hi: 0}, outL: Uint128{Lo: 0x110, Hi: 0}}, - {in: Uint128{Lo: 0x11, Hi: 0x3}, bits: 65, outR: Uint128{Lo: 1, Hi: 0}, outL: Uint128{Lo: 0, Hi: 0x22}}, - {in: Uint128{Lo: 0, Hi: 0x11}, bits: 68, outR: Uint128{Lo: 1, Hi: 0}, outL: Uint128{Lo: 0, Hi: 0}}, - } - for _, c := range cases { - u := c.in - u.ShiftRight(c.bits) - if u != c.outR { - t.Fatalf("shift %s right by %d: expected %s, got %s", - c.in, c.bits, c.outR, u) - } - u = c.in - u.ShiftLeft(c.bits) - if u != c.outL { - t.Fatalf("shift %s left by %d: expected %s, got %s", - c.in, c.bits, c.outL, u) - } - } -} diff --git a/apophenia/permute.go b/apophenia/permute.go deleted file mode 100644 index 9a78f8d..0000000 --- a/apophenia/permute.go +++ /dev/null @@ -1,127 +0,0 @@ -package apophenia - -import ( - "errors" - "math/bits" -) - -// PermutationGenerator provides a way to pass integer IDs through a permutation -// map that is pseudorandom but repeatable. This could be done with rand.Perm, -// but that would require storing a slice of [Items]int64, which we want to avoid -// for large values of Items. -// -// Not actually cryptographically secure. 
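//
// A minimal usage sketch (the seed and size are arbitrary): permute the range
// [0,1000000) reproducibly, and jump straight to the 500,000th value:
//
//	seq := NewSequence(42)
//	p, err := NewPermutation(1000000, 0, seq)
//	if err != nil {
//		panic(err)
//	}
//	first := p.Next()       // same value as p.Nth(0) on a fresh Permutation
//	middle := p.Nth(500000) // seekable: no need to generate the first 500,000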
-type Permutation struct { - src Sequence - permSeed uint32 - max int64 - counter int64 - rounds int - bits Uint128 - k []uint64 -} - -// Design notes: -// -// This is based on: -// http://arxiv.org/abs/1208.1176v2 -// -// This simulates the results of a shuffle in a way allowing a lookup of -// the results of the shuffle for any given position, in time proportional -// to a number of "rounds", each of which is 50% likely to swap a slot -// with another slot. The number of rounds needed to achieve a reasonable -// probability of safety is log(N)*6 or so. -// -// Each permutation is fully defined by a "key", consisting of: -// 1. A key "KF" naming a value in [0,max) for each round. -// 2. A series of round functions mapping values in [0,max) to bits, -// one for each round. -// I refer to these as K[r] and F[r]. Thus, K[0] is the index used to -// compute swap operations four round 0, and F[0] is the series of bits -// used to determine whether a swap is performed, with F[0][0] being -// the swap decision for slot 0 in round 0. (Except it probably isn't, -// because the swap decision is actually made based on the highest index -// in a pair, to ensure that a swap between A and B always uses the same -// decision bit.) -// -// K values are generated using the SequencePermutationK range of offsets, -// with -// -// For F values, we set byte 8 of the plain text to 0x00, and use -// encoding/binary to dump the slot number into the first 8 bytes. This -// yields 128 values, which we treat as the values for the first 128 rounds, -// and then recycle for rounds 129+ if those exist. This is not very -// secure, but we're already at 1/2^128 chances by that time and don't care. -// We could probably trim rounds to 64 or so and not lose much data. - -// NewPermutation creates a Permutation which generates values in [0,m), -// from a given Sequence and seed value. -// -// The seed parameter selects different shuffles, and is useful if you need -// to generate multiple distinct shuffles from the same underlying sequence. -// Treat it as a secondary seed. -func NewPermutation(max int64, seed uint32, src Sequence) (*Permutation, error) { - if max < 1 { - return nil, errors.New("period must be positive") - } - // number of rounds to get "good" results is roughly 6 log N. - bits := 64 - bits.LeadingZeros64(uint64(max)) - p := Permutation{max: max, rounds: 6 * bits, counter: 0} - - p.src = src - p.k = make([]uint64, p.rounds) - p.permSeed = seed - offset := OffsetFor(SequencePermutationK, p.permSeed, 0, 0) - for i := uint64(0); i < uint64(p.rounds); i++ { - offset.Lo = i - p.k[i] = p.src.BitsAt(offset).Lo % uint64(p.max) - } - return &p, nil -} - -// Next generates the next value from the permutation. -func (p *Permutation) Next() (ret int64) { - return p.nextValue() -} - -// Nth generates the Nth value from the permutation. For instance, -// given a new permutation, calling Next once produces the same -// value you'd get from calling Nth(0). This is a seek which changes -// the offset that Next will count from; after calling Nth(x), you -// would get the same result from Next() that you would from Nth(x+1). -func (p *Permutation) Nth(n int64) (ret int64) { - p.counter = n - ret = p.nextValue() - return ret -} - -func (p *Permutation) nextValue() int64 { - p.counter = int64(uint64(p.counter) % uint64(p.max)) - x := uint64(p.counter) - p.counter++ - // a value which can't possibly be the next value we need, so we - // always hash on the first pass. 
- prev := uint64(p.max) + 1 - offset := OffsetFor(SequencePermutationF, p.permSeed, 0, 0) - for i := uint64(0); i < uint64(p.rounds); i++ { - if i > 0 && i&127 == 0 { - offset.Hi++ - // force regeneration of bits down below - prev = uint64(p.max) + 1 - } - xPrime := (p.k[i] + uint64(p.max) - x) % uint64(p.max) - xCaret := x - if xPrime > xCaret { - xCaret = xPrime - } - if xCaret != prev { - offset.Lo = xCaret - p.bits = p.src.BitsAt(offset) - prev = xCaret - } - if p.bits.Bit(i) != 0 { - x = xPrime - } - } - return int64(x) -} diff --git a/apophenia/permute_test.go b/apophenia/permute_test.go deleted file mode 100644 index 71e1f59..0000000 --- a/apophenia/permute_test.go +++ /dev/null @@ -1,128 +0,0 @@ -package apophenia - -import ( - "fmt" - "testing" -) - -func PermutationOrBust(period int64, seed int64, expectedErr string, tb testing.TB) *Permutation { - seq := NewSequence(seed) - p, err := NewPermutation(period, 0, seq) - if err != nil { - if expectedErr == "" || expectedErr != err.Error() { - tb.Fatalf("unexpected error creating permutation generator: %s", err) - } - return p - } - if p == nil { - tb.Fatalf("unexpected nil permutation generator without error") - } - return p -} - -func Test_PermuteCycle(t *testing.T) { - sizes := []int64{8, 23, 64, 10000} - for _, size := range sizes { - p := PermutationOrBust(size, 0, "", t) - seen := make(map[int64]struct{}, size) - for i := int64(0); i < size; i++ { - n := p.Next() - if _, ok := seen[n]; ok { - list := make([]int64, len(seen)) - j := 0 - for k := range seen { - list[j] = k - j++ - } - t.Fatalf("size %d: got duplicate entry %d in %v", size, n, list) - } - seen[n] = struct{}{} - } - } -} - -func TestPermuteSeed(t *testing.T) { - size := int64(129) - seeds := int64(8) - p := make([]*Permutation, seeds) - seen := make([]map[int64]struct{}, seeds) - for s := int64(0); s < seeds; s++ { - p[s] = PermutationOrBust(size, s, "", t) - seen[s] = make(map[int64]struct{}, size) - } - matches := int64(0) - v := make([]int64, seeds) - for i := int64(0); i < size; i++ { - for s := int64(0); s < seeds; s++ { - v[s] = p[s].Next() - if _, ok := seen[s][v[s]]; ok { - t.Fatalf("duplicate entry (size %d, seed %d, entry %d): %d", - size, s, i, v[s]) - } - seen[s][v[s]] = struct{}{} - } - for s := int64(1); s < seeds; s++ { - if v[s] == v[s-1] { - matches++ - } - } - } - // assuming number of outcomes is more than about 16, matches are pretty rare if nothing - // is wrong. 
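	// (Rough expectation, for context: two independent uniform permutations
	// agree at about one position on average, so with 7 adjacent seed pairs
	// we expect on the order of 7 matches; the check below only fails above
	// size*seeds/8 = 129.)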
- if (matches * 8) > (size * seeds) { - t.Fatalf("too many matches: %d values to permute, %d seeds, %d matches seems suspicious.", - size, seeds, matches) - } else { - t.Logf("permuting %d values across %d seeds: %d matches (OK)", size, seeds, matches) - } -} - -func Test_PermuteNth(t *testing.T) { - size := int64(129) - seeds := int64(8) - p := make([][]*Permutation, seeds) - seen := make([]map[int64]struct{}, seeds) - for s := int64(0); s < seeds; s++ { - p[s] = make([]*Permutation, 2) - p[s][0] = PermutationOrBust(size, s, "", t) - p[s][1] = PermutationOrBust(size, s, "", t) - seen[s] = make(map[int64]struct{}, size) - } - matches := int64(0) - v := make([]int64, seeds) - for i := int64(0); i < size; i++ { - for s := int64(0); s < seeds; s++ { - v[s] = p[s][0].Next() - if _, ok := seen[s][v[s]]; ok { - t.Fatalf("duplicate entry (size %d, seed %d, entry %d): %d", - size, s, i, v[s]) - } - seen[s][v[s]] = struct{}{} - vN := p[s][1].Nth(i) - if vN != v[s] { - t.Fatalf("Nth entry didn't match Nth call to Next()) (size %d, seed %d, n %d): expected %d, got %d\n", - size, s, i, v[s], vN) - } - } - } - // assuming number of outcomes is more than about 16, matches are pretty rare if nothing - // is wrong. - if (matches * 8) > (size * seeds) { - t.Fatalf("too many matches: %d values to permute, %d seeds, %d matches seems suspicious.", - size, seeds, matches) - } else { - t.Logf("permuting %d values across %d seeds: %d matches (OK)", size, seeds, matches) - } -} - -func Benchmark_PermuteCycle(b *testing.B) { - sizes := []int64{5, 63, 1000000, (1 << 19)} - for _, size := range sizes { - b.Run(fmt.Sprintf("Pool%d", size), func(b *testing.B) { - p := PermutationOrBust(size, 0, "", b) - for i := 0; i < b.N; i++ { - _ = p.Next() - } - }) - } -} diff --git a/apophenia/weighted.go b/apophenia/weighted.go deleted file mode 100644 index 0218e9c..0000000 --- a/apophenia/weighted.go +++ /dev/null @@ -1,97 +0,0 @@ -package apophenia - -import ( - "errors" - "math/bits" -) - -// Weighted provides a generator which produces weighted bits -- bits with -// a specified probability of being set, as opposed to random values weighted -// a given way. Bit density is specified as N/M, with M a (positive) -// power of 2, and N an integer between 0 and M. -// -// The underlying generator can produce 2^128 128-bit values, but the -// iterative process requires log2(densityScale) 128-bit values from the -// source per 128-bit output, so the top 7 bits of the address are used -// as an iteration counter. Thus, Weighted.Bit can produce 2^128 distinct -// values, but if the offset provided to Weighted.Bits is over 2^121, there -// will be overlap between the source values used for those bits, and -// source values used (for different iterations) for other offsets. -type Weighted struct { - src Sequence - // internal result cache - lastValue Uint128 - lastOffset Uint128 - lastDensity, lastScale uint64 -} - -// NewWeighted yields a new Weighted using the given sequence as a source of -// seekable pseudo-random bits. -func NewWeighted(src Sequence) (*Weighted, error) { - if src == nil { - return nil, errors.New("new Weighted requires a non-nil source") - } - w := Weighted{src: src} - return &w, nil -} - -// Bit returns the single 0-or-1 bit at the specified offset. -func (w *Weighted) Bit(offset Uint128, density uint64, scale uint64) uint64 { - var bit uint64 - // In order to be able to cache/reuse values, we want to grab a whole - // set of 128 bits including a given offset, and use the same - // calculation for all of them. 
So we mask out the low-order 7 bits - // of offset, and use them separately. Meanwhile, Bits will - // always right-shift its column bits by 7, which reduces the - // space of possible results but means that it produces the same - // set of bits for any given batch... - offset.Lo, bit = offset.Lo&^127, offset.Lo&127 - if offset == w.lastOffset && density == w.lastDensity && scale == w.lastScale { - return w.lastValue.Bit(bit) - } - w.lastValue = w.Bits(offset, density, scale) - w.lastOffset, w.lastDensity, w.lastScale = offset, density, scale - return w.lastValue.Bit(bit) -} - -const weightedIterationMask = (^uint64(0)) >> 7 - -// Bits returns the 128-bit set of bits including offset. The column portion -// of offset is right-shifted by 7 to match the offset calculations in Bit(), -// above. Thus, you get the same values back for each sequence of 128 consecutive -// offsets. -func (w *Weighted) Bits(offset Uint128, density uint64, scale uint64) (out Uint128) { - // magic accommodation for choices made elsewhere. - offset.Lo >>= 7 - if density == scale { - out.Not() - return out - } - if density == 0 { - return out - } - lz := uint(bits.TrailingZeros64(density)) - density >>= lz - scale >>= lz - // generate the same results we would have without this hackery - offset.Hi += uint64(lz) - for scale > 1 { - next := w.src.BitsAt(offset) - if density&1 != 0 { - out.Or(next) - } else { - out.And(next) - } - density >>= 1 - scale >>= 1 - // iteration is stashed in the bottom 24-bits of an offset - offset.Hi++ - } - return out -} - -// NextBits returns the next batch of bits after the last one retrieved. -func (w *Weighted) NextBits(density, scale uint64) (out Uint128) { - w.lastOffset.Inc() - return w.Bits(w.lastOffset, density, scale) -} diff --git a/apophenia/weighted_test.go b/apophenia/weighted_test.go deleted file mode 100644 index bd61bed..0000000 --- a/apophenia/weighted_test.go +++ /dev/null @@ -1,27 +0,0 @@ -package apophenia - -import ( - "fmt" - "testing" -) - -func Benchmark_WeightedDistribution(b *testing.B) { - src := NewSequence(0) - w, err := NewWeighted(src) - if err != nil { - b.Fatalf("couldn't make weighted: %v", err) - } - scales := []uint64{3, 6, 12, 18, 24, 63} - for _, scale := range scales { - off := OffsetFor(SequenceWeighted, 0, 0, 0) - scaled := uint64(1 << scale) - b.Run(fmt.Sprintf("Scale%d", scale), func(b *testing.B) { - for i := 0; i < b.N; i++ { - w.Bits(off, 1, scaled) - w.Bits(off, scaled/2, scaled) - w.Bits(off, scaled-1, scaled) - } - }) - } - -} diff --git a/apophenia/zipf.go b/apophenia/zipf.go deleted file mode 100644 index 457ec06..0000000 --- a/apophenia/zipf.go +++ /dev/null @@ -1,113 +0,0 @@ -package apophenia - -import ( - "fmt" - "math" -) - -// Zipf produces a series of values following a Zipf distribution. -// It is initialized with values q, v, and max, and produces values -// in the range [0,max) such that the probability of a value k is -// proportional to (v+k) ** -q. v must be >= 1, q must be > 1. -// -// This is based on the same paper used for the golang stdlib Zipf -// distribution: -// -// "Rejection-Inversion to Generate Variates -// from Monotone Discrete Distributions" -// W.Hormann, G.Derflinger [1996] -// http://eeyore.wu-wien.ac.at/papers/96-04-04.wh-der.ps.gz -// -// This implementation differs from stdlib's in that it is seekable; you -// can get the Nth value in a theoretical series of results in constant -// time, without having to generate the whole series linearly. 
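//
// A minimal usage sketch (the parameters are arbitrary): draw the 1000th value
// of a Zipf(q=1.5, v=2) series over [0,1000000) without generating the first
// 999 values:
//
//	seq := NewSequence(42)
//	z, err := NewZipf(1.5, 2, 1000000, 0, seq)
//	if err != nil {
//		panic(err)
//	}
//	k := z.Nth(1000)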
-type Zipf struct { - src Sequence - seed uint32 - q float64 - v float64 - max float64 - oneMinusQ float64 - oneOverOneMinusQ float64 - h func(float64) float64 - hInv func(float64) float64 - hImaxOneHalf float64 - hX0MinusHImaxOneHalf float64 // hX0 is only ever used as hX0 - h(i[max] + 1/2) - s float64 - idx uint64 -} - -// NewZipf returns a new Zipf object with the specified q, v, and -// max, and with its random source seeded in some way by seed. -// The sequence of values returned is consistent for a given set -// of inputs. The seed parameter can select one of multiple sub-sequences -// of the given sequence. -func NewZipf(q float64, v float64, max uint64, seed uint32, src Sequence) (z *Zipf, err error) { - if math.IsNaN(q) || math.IsNaN(v) { - return nil, fmt.Errorf("q (%g) and v (%g) must not be NaN for Zipf distribution", q, v) - } - if q <= 1 || v < 1 { - return nil, fmt.Errorf("need q > 1 (got %g) and v >= 1 (got %g) for Zipf distribution", q, v) - } - if src == nil { - return nil, fmt.Errorf("need a usable PRNG apophenia.Sequence") - } - oneMinusQ := 1 - q - oneOverOneMinusQ := 1 / (1 - q) - z = &Zipf{ - q: q, - v: v, - max: float64(max), - seed: seed, - oneMinusQ: oneMinusQ, - oneOverOneMinusQ: oneOverOneMinusQ, - } - z.h = func(x float64) float64 { - return math.Exp((1-q)*math.Log(v+x)) * oneOverOneMinusQ - } - z.hInv = func(x float64) float64 { - return -v + math.Exp(oneOverOneMinusQ*math.Log(oneMinusQ*x)) - } - hX0 := z.h(0.5) - math.Exp(math.Log(v)*-q) - z.hImaxOneHalf = z.h(z.max + 0.5) - z.hX0MinusHImaxOneHalf = hX0 - z.hImaxOneHalf - z.s = 1 - z.hInv(z.h(1.5)-math.Exp(math.Log(v+1)*-q)) - z.src = src - if err != nil { - return nil, err - } - return z, nil -} - -// Nth returns the Nth value from the sequence associated with the -// given Zipf. For a given set of input values (q, v, max, and seed), -// and a given index, the same value is returned. -func (z *Zipf) Nth(index uint64) uint64 { - z.idx = index - offset := OffsetFor(SequenceZipfU, z.seed, 0, index) - for { - bits := z.src.BitsAt(offset) - uInt := bits.Lo - u := float64(uInt&(1<<53-1)) / (1 << 53) - u = z.hImaxOneHalf + u*z.hX0MinusHImaxOneHalf - x := z.hInv(u) - k := math.Floor(x + 0.5) - if k-x <= z.s { - return uint64(k) - } - if u >= z.h(k+0.5)-math.Exp(-math.Log(z.v+k)*z.q) { - return uint64(k) - } - // the low-order 24 bits of the high-order 64-bit word - // are the "iteration", which started as zero. Assuming we - // don't need more than ~16.7M values, we're good. The expected - // average is about 1.1. - offset.Hi++ - } -} - -// Next returns the "next" value -- the one after the last one requested, or -// value 1 if none have been requested before. 
-func (z *Zipf) Next() uint64 { - return z.Nth(z.idx + 1) -} diff --git a/apophenia/zipf_test.go b/apophenia/zipf_test.go deleted file mode 100644 index fab6b1b..0000000 --- a/apophenia/zipf_test.go +++ /dev/null @@ -1,107 +0,0 @@ -package apophenia - -import ( - "fmt" - "math" - "math/rand" - "testing" -) - -type testCase struct { - name string - s, v float64 - m uint64 -} - -var testCases = []testCase{ - {s: 1.01, v: 1, m: 100}, - {s: 2, v: 1, m: 100}, - {s: 1.01, v: 100, m: 1000}, - {s: 2, v: 10000, m: 1000}, -} - -func (tc testCase) Name() string { - if tc.name != "" { - return tc.name - } - return fmt.Sprintf("(zipf:s%f,v%f,m%d)", tc.s, tc.v, tc.m) -} - -func runZipf(zf func() uint64, values []uint64, n uint64, t *testing.T) { - for i := uint64(0); i < n; i++ { - x := zf() - if x < 0 || x >= uint64(len(values)) { - t.Fatalf("got out-of-range value %d from zipf function", x) - } - values[x]++ - } -} - -type zipfTestCase struct { - q, v float64 - seq Sequence - exp string -} - -func (z zipfTestCase) String() string { - return fmt.Sprintf("q: %g, v: %g, seq: %t, expected error: %t", - z.q, z.v, z.seq != nil, z.exp != "") -} - -func Test_InvalidInputs(t *testing.T) { - seq := NewSequence(0) - testCases := []zipfTestCase{ - {q: 1, v: 1.1, seq: seq, exp: "need q > 1 (got 1) and v >= 1 (got 1.1) for Zipf distribution"}, - {q: 1.1, v: 0.99, seq: seq, exp: "need q > 1 (got 1.1) and v >= 1 (got 0.99) for Zipf distribution"}, - {q: 1.1, v: 1.1, seq: nil, exp: "need a usable PRNG apophenia.Sequence"}, - {q: math.NaN(), v: 1.1, seq: nil, exp: "q (NaN) and v (1.1) must not be NaN for Zipf distribution"}, - {q: 1.01, v: 2, seq: seq, exp: ""}, - } - for _, c := range testCases { - z, err := NewZipf(c.q, c.v, 20, 0, c.seq) - if c.exp != "" { - if err == nil { - t.Errorf("case %v: expected error '%s', got no error", c, c.exp) - } else if err.Error() != c.exp { - t.Errorf("case %v: expected error '%s', got error '%s'", c, c.exp, err.Error()) - } - } else { - if err != nil { - t.Errorf("case %v: unexpected error %v", c, err) - } else if z == nil { - t.Errorf("case %v: nil Zipf despite no error", c) - } - } - } -} - -const runs = 1000000 - -func Test_CompareWithMath(t *testing.T) { - failed := false - for idx, c := range testCases { - stdlibValues := make([]uint64, c.m+1) - zipfValues := make([]uint64, c.m+1) - stdlibZipf := rand.NewZipf(rand.New(rand.NewSource(int64(idx))), c.s, c.v, c.m) - seq := NewSequence(int64(idx)) - zipfZipf, err := NewZipf(c.s, c.v, c.m, 0, seq) - if err != nil { - t.Fatalf("failed to create newZipf: %s", err) - } - runZipf(stdlibZipf.Uint64, stdlibValues, runs, t) - runZipf(zipfZipf.Next, zipfValues, runs, t) - for i := uint64(0); i < c.m; i++ { - stdlibP := float64(stdlibValues[i]) / runs - zipfP := float64(zipfValues[i]) / runs - diff := math.Abs(stdlibP - zipfP) - if diff > 0.001 { - failed = true - t.Logf("%s: stdlib %d, zipf %d, diff %f [s %f, v %f]", - c.Name(), stdlibValues[i], zipfValues[i], diff, c.s, c.v) - } - } - } - if failed { - t.Fail() - } -} diff --git a/bench/zipf.go b/bench/zipf.go index 7ed871e..7a7b710 100644 --- a/bench/zipf.go +++ b/bench/zipf.go @@ -9,8 +9,8 @@ import ( "os" "time" + "github.com/molecula/apophenia" "github.com/pilosa/go-pilosa" - "github.com/pilosa/tools/apophenia" ) // ZipfBenchmark sets random bits according to the Zipf-Mandelbrot distribution. 
diff --git a/go.mod b/go.mod index 09af251..e98ee72 100644 --- a/go.mod +++ b/go.mod @@ -5,6 +5,7 @@ go 1.12 require ( cloud.google.com/go v0.43.0 // indirect github.com/BurntSushi/toml v0.3.1 + github.com/alvaroloes/enumer v1.1.2 // indirect github.com/coreos/go-systemd v0.0.0-20190719114852-fd7a80b32e1f // indirect github.com/go-kit/kit v0.9.0 // indirect github.com/golang/groupcache v0.0.0-20190702054246-869f871628b6 // indirect @@ -14,7 +15,9 @@ require ( github.com/jaffee/commandeer v0.1.0 github.com/kr/pty v1.1.8 // indirect github.com/miekg/dns v1.1.15 // indirect + github.com/molecula/apophenia v0.0.0-20190827192002-68b7a14a478b github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f // indirect + github.com/pascaldekloe/name v1.0.0 // indirect github.com/pilosa/go-pilosa v1.3.1-0.20190715210601-8606626b90d6 github.com/pilosa/pilosa v1.3.1 github.com/pkg/errors v0.8.1 @@ -31,7 +34,7 @@ require ( golang.org/x/net v0.0.0-20190628185345-da137c7871d7 // indirect golang.org/x/sync v0.0.0-20190423024810-112230192c58 golang.org/x/sys v0.0.0-20190712062909-fae7ac547cb7 // indirect - golang.org/x/tools v0.0.0-20190719005602-e377ae9d6386 // indirect + golang.org/x/tools v0.0.0-20191121172411-96ad48e4b060 // indirect google.golang.org/grpc v1.22.0 // indirect ) diff --git a/go.sum b/go.sum index d5de1de..da841af 100644 --- a/go.sum +++ b/go.sum @@ -20,6 +20,8 @@ github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc/go.mod h1:LOuy github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc= github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0= github.com/alecthomas/units v0.0.0-20190717042225-c3de453c63f4/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0= +github.com/alvaroloes/enumer v1.1.2 h1:5khqHB33TZy1GWCO/lZwcroBFh7u+0j40T83VUbfAMY= +github.com/alvaroloes/enumer v1.1.2/go.mod h1:FxrjvuXoDAx9isTJrv4c+T410zFi0DtXIT0m65DJ+Wo= github.com/armon/consul-api v0.0.0-20180202201655-eb2c6b5be1b6/go.mod h1:grANhF5doyWs3UAsr3K4I6qtAmlQcZDesFNEHPZAzj8= github.com/armon/go-metrics v0.0.0-20180917152333-f0300d1749da/go.mod h1:Q73ZrmVTwzkszR9V5SSuryQ31EELlFMUz1kKyl939pY= github.com/armon/go-metrics v0.0.0-20190430140413-ec5e00d3c878 h1:EFSB7Zo9Eg91v7MJPVsifUysc/wPdN+NOnVe6bWbdBM= @@ -185,6 +187,8 @@ github.com/mitchellh/mapstructure v1.1.2 h1:fmNYVwqnSfB9mZU6OS2O6GsXM+wcskZDuKQz github.com/mitchellh/mapstructure v1.1.2/go.mod h1:FVVH3fgwuzCH5S8UJGiWEs2h04kUh9fWfEaFds41c1Y= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/reflect2 v1.0.1/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0= +github.com/molecula/apophenia v0.0.0-20190827192002-68b7a14a478b h1:cZADDaNYM7xn/nklO3g198JerGQjadFuA0ofxBJgK0Y= +github.com/molecula/apophenia v0.0.0-20190827192002-68b7a14a478b/go.mod h1:uXd1BiH7xLmgkhVmspdJLENv6uGWrTL/MQX2TN7Yz9s= github.com/mwitkow/go-conntrack v0.0.0-20161129095857-cc309e4a2223/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U= github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U= github.com/oklog/ulid v1.3.1/go.mod h1:CirwcVhetQ6Lv90oh/F+FBtV6XMibvdAFo93nm5qn4U= @@ -194,6 +198,10 @@ github.com/opentracing/opentracing-go v1.1.0/go.mod h1:UkNAQd3GIcIGf0SeVgPpRdFSt github.com/pascaldekloe/goe v0.0.0-20180627143212-57f6aae5913c/go.mod h1:lzWF7FIEvWOWxwDKqyGYQf6ZUaNfKdP144TG7ZOy1lc= 
github.com/pascaldekloe/goe v0.1.0 h1:cBOtyMzM9HTpWjXfbbunk26uA6nG3a8n06Wieeh0MwY= github.com/pascaldekloe/goe v0.1.0/go.mod h1:lzWF7FIEvWOWxwDKqyGYQf6ZUaNfKdP144TG7ZOy1lc= +github.com/pascaldekloe/name v0.0.0-20180628100202-0fd16699aae1 h1:/I3lTljEEDNYLho3/FUB7iD/oc2cEFgVmbHzV+O0PtU= +github.com/pascaldekloe/name v0.0.0-20180628100202-0fd16699aae1/go.mod h1:eD5JxqMiuNYyFNmyY9rkJ/slN8y59oEu4Ei7F8OoKWQ= +github.com/pascaldekloe/name v1.0.0 h1:n7LKFgHixETzxpRv2R77YgPUFo85QHGZKrdaYm7eY5U= +github.com/pascaldekloe/name v1.0.0/go.mod h1:Z//MfYJnH4jVpQ9wkclwu2I2MkHmXTlT9wR5UZScttM= github.com/pelletier/go-toml v1.2.0/go.mod h1:5z9KED0ma1S8pY6P1sdut58dfprrGBbd/94hg7ilaic= github.com/pelletier/go-toml v1.4.0 h1:u3Z1r+oOXJIkxqw34zVhyPgjBsm6X2wn21NWs/HfSeg= github.com/pelletier/go-toml v1.4.0/go.mod h1:PN7xzY2wHTK0K9p34ErDQMlFxa51Fk0OUruD3k1mMwo= @@ -410,11 +418,16 @@ golang.org/x/tools v0.0.0-20190312170243-e65039ee4138/go.mod h1:LCzVGOaR6xXOjkQ3 golang.org/x/tools v0.0.0-20190425150028-36563e24a262/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= golang.org/x/tools v0.0.0-20190506145303-2d16b83fe98c/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= golang.org/x/tools v0.0.0-20190524140312-2c0ae7006135/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= +golang.org/x/tools v0.0.0-20190524210228-3d17549cdc6b/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= golang.org/x/tools v0.0.0-20190530171427-2b03ca6e44eb/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= golang.org/x/tools v0.0.0-20190606124116-d0a3d012864b/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= golang.org/x/tools v0.0.0-20190618163018-fdf1049a943a/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= golang.org/x/tools v0.0.0-20190628153133-6cdbf07be9d0/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= +golang.org/x/tools v0.0.0-20190719005602-e377ae9d6386 h1:W/t3IYUOQPd8DK2ssOWA8sjulHHMxzTgiQkSx0z5sRQ= golang.org/x/tools v0.0.0-20190719005602-e377ae9d6386/go.mod h1:jcCCGcm9btYwXyDqrUWc6MKQKKGJCWEQ3AfLSRIbEuI= +golang.org/x/tools v0.0.0-20191121172411-96ad48e4b060 h1:8y0nnVoyek4Pq5wCSqEDfFTSD8ODhH8zjM2/a5et2UI= +golang.org/x/tools v0.0.0-20191121172411-96ad48e4b060/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= google.golang.org/api v0.4.0/go.mod h1:8k5glujaEP+g9n7WNsDg8QP6cUVNI86fCNMcbazEtwE= google.golang.org/api v0.6.0/go.mod h1:btoxGiFvQNVUZQ8W08zLtrVS08CNpINPEfxXxgJL1Q4= google.golang.org/api v0.7.0/go.mod h1:WtwebWUNSVBH/HAw79HIFXZNqEvBhG+Ra+ax0hx3E3M= diff --git a/imagine/enums_densitytype.go b/imagine/enums_densitytype.go index ac6d32d..ca121a1 100644 --- a/imagine/enums_densitytype.go +++ b/imagine/enums_densitytype.go @@ -7,9 +7,9 @@ import ( "fmt" ) -const _densityTypeName = "linearzipf" +const _densityTypeName = "linearzipfpattern" -var _densityTypeIndex = [...]uint8{0, 6, 10} +var _densityTypeIndex = [...]uint8{0, 6, 10, 17} func (i densityType) String() string { if i < 0 || i >= densityType(len(_densityTypeIndex)-1) { @@ -18,11 +18,12 @@ func (i densityType) String() string { return _densityTypeName[_densityTypeIndex[i]:_densityTypeIndex[i+1]] } -var _densityTypeValues = []densityType{0, 1} +var _densityTypeValues = []densityType{0, 1, 2} var _densityTypeNameToValueMap = map[string]densityType{ - _densityTypeName[0:6]: 0, - _densityTypeName[6:10]: 1, + _densityTypeName[0:6]: 0, + _densityTypeName[6:10]: 1, + _densityTypeName[10:17]: 2, } // densityTypeString 
retrieves an enum value from the enum constants string name. diff --git a/imagine/generators.go b/imagine/generators.go index ceb3393..ff91ab6 100644 --- a/imagine/generators.go +++ b/imagine/generators.go @@ -5,6 +5,7 @@ import ( "fmt" "io" "math" + "math/bits" "math/rand" "sync" "time" @@ -82,6 +83,9 @@ func newSetGenerator(ts *taskSpec, updateChan chan taskUpdate, updateID string) if fs.FastSparse { return newFastValueGenerator(fs), nil } + if fs.ValueRule == densityTypePattern { + return fs.patternGen.BitGenerator(fs.Subpattern, ts, updateChan, updateID) + } // even though this is a set generator, we will treat it like a mutex generator -- we generate a series // of individual values rather than populating every field if ts.ColumnOrder == valueOrderZipf { @@ -158,6 +162,10 @@ func (g *singleValueGenerator) prepareSingleValueGenerator(ts *taskSpec, updateC // that computes a single value for a column, then returns it as a // pilosa.Column. func newMutexGenerator(ts *taskSpec, updateChan chan taskUpdate, updateID string) (iter CountingIterator, err error) { + fs := ts.FieldSpec + if fs.ValueRule == densityTypePattern { + return fs.patternGen.DigitGenerator(fs.Subpattern, ts, updateChan, updateID) + } g := columnValueGenerator{} err = g.prepareSingleValueGenerator(ts, updateChan, updateID) if err != nil { @@ -170,6 +178,10 @@ func newMutexGenerator(ts *taskSpec, updateChan chan taskUpdate, updateID string // that computes a single value for a column, then returns it as a // pilosa.FieldValue. func newIntGenerator(ts *taskSpec, updateChan chan taskUpdate, updateID string) (iter CountingIterator, err error) { + fs := ts.FieldSpec + if fs.ValueRule == densityTypePattern { + return fs.patternGen.DigitGenerator(fs.Subpattern, ts, updateChan, updateID) + } g := fieldValueGenerator{} err = g.prepareSingleValueGenerator(ts, updateChan, updateID) if err != nil { @@ -921,3 +933,404 @@ func newZipf(a float64, n int) zipf { func (z zipf) F(x int) float64 { return 1.0 / (math.Pow(float64(x), z.a) * z.s) } + +// Pattern represents shared logic for generating related values in multiple +// fields. +type Pattern interface { + // Rows indicates how many rows this pattern will want to generate, if it has + // a preference. For instance, a generator that wants to generate ten related fields + // would return 10. + Rows() int64 + // BitGenerator yields a named generator for set-type fields. + BitGenerator(name string, ts *taskSpec, updateChan chan taskUpdate, updateID string) (CountingIterator, error) + // DigitGenerator yields a named generator for int-type fields. + DigitGenerator(name string, ts *taskSpec, updateChan chan taskUpdate, updateID string) (CountingIterator, error) + // Max reports maximum generated value/row for the given name and number of columns + Max(name string, columns uint64) (int64, error) +} + +var patternsByName = map[string]func(n, mod, seed int64) (Pattern, error){ + "triangle": newTrianglePattern, +} + +// trianglePattern represents a pattern of digits which repeat in a +// pattern that produces triangle numbers. For instance, with n=4, it would +// produce `0010120123`, for a total of 10 columns per loop. It also takes +// an exponent, which determines what power of n the iteration happens in. +// For instance, with n=10, exp=0, the values would be +// 0, 0, 1, 0, 1, 2 [...], 9, +// 10, 10, 11, 10, 11, 12, ..., 19, +// 20, 20, 21, ... 
+// With n=10, exp=1, instead you'd get: +// 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, +// 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, +// 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, +// 0, 1, 2 [...] +// Either way, the 550th value would be 99. +// +// For n=10, this implies 55 consecutive columns, with digits: +// 0010120123012340123450123456012345670123456780123456789 +// +// We then have corresponding bitmasks: +// val[0] 1101001000100001000001000000100000001000000001000000000 [10 bits, 1 value] +// val[1] 0010100100010000100000100000010000000100000000100000000 [9 bits, 1 value] +// [...] +// val[9] 0000000000000000000000000000000000000000000000000000001 [1 bit, 1 value] +// +// once[0] 1000000000000000000000000000000000000000000000000000000 [1 bit, 1 value] +// once[1] 1010000000000000000000000000000000000000000000000000000 [2 bits, 2 values] +// [...] +// once[9] 1010010001000010000010000001000000010000000010000000001 [10 bits, 10 values] +// +// upto[0] 1101001000100001000001000000100000001000000001000000000 [10 bits, 1 value] +// upto[1] 1110001100110001100001100000110000001100000001100000000 [19 bits, 2 values] +// upto[3] 1111001110111001110001110000111000001110000001110000000 [27 bits, 3 values] +// [...] +// upto[8] 1111111111111111111111111111111111111111111111111111110 [54 bits, 9 values] +// upto[9] 1111111111111111111111111111111111111111111111111111111 [55 bits, 10 values] +// +// over[9] 0000000000000000000000000000000000000000000000000000001 [1 bit, 1 value] +// over[8] 0000000000000000000000000000000000000000000010000000011 [3 bits, 2 values] +// over[7] 0000000000000000000000000000000000010000000110000000111 [6 bits, 3 values] +// [...] +// over[1] 0010110111011110111110111111011111110111111110111111111 [45 bits, 9 values] +// over[0] 1111111111111111111111111111111111111111111111111111111 [55 bits, 10 values] +type trianglePattern struct { + n int64 + exp int64 + repeat uint64 // repetition interval for bits + digits []uint64 + equal bitSetSlice + once bitSetSlice + upto bitSetSlice + over bitSetSlice +} + +type bitSet []uint64 +type bitSetSlice []bitSet + +func (b bitSet) set(bit uint64) { + b[bit>>6] |= (1 << (bit & 63)) +} + +func (b bitSet) clear(bit uint64) { + b[bit>>6] &^= (1 << (bit & 63)) +} + +// bit(n) is 0 or 1 +func (b bitSet) bit(bit uint64) uint64 { + return (b[bit>>6] >> (bit & 63)) & 1 +} + +// add unions in the bits from another bitSet +func (b bitSet) add(other bitSet) { + for i := range b { + b[i] |= other[i] + } +} + +func (b bitSet) count() int { + var total int + for _, v := range b { + total += bits.OnesCount64(v) + } + return total +} + +// only useful for debugging, I suppose +func (s bitSetSlice) print() { + for row, set := range s { + count := set.count() + fmt.Printf(" %2d: ", row) + for i := len(set) - 1; i >= 0; i-- { + fmt.Printf("%#x", set[i]) + } + fmt.Printf(" [%d]\n", count) + } +} + +// biterator is an iterator which can yield successive column/row pairs using a +// given pattern of bits. 
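//
// As a concrete illustration (taken from the triangle pattern with n=3, exp=0,
// digit string 001012, repeat 1, over an 8-column range): the "equal" masks
// yield row 0 at columns 0, 1, 3, 6, 7, then row 1 at columns 2 and 4, then
// row 2 at column 5; the iterator scans one row across the whole column range
// before moving on to the next row.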
+type biterator struct { + vals bitSetSlice + valsPerRow uint64 + startColumn uint64 + column uint64 + row uint64 + maxColumn uint64 + generated int64 + tries int64 + repeat uint64 // how many times to repeat + updateChan chan taskUpdate + updateID string +} + +func newBiterator(target bitSetSlice, updateChan chan taskUpdate, updateID string, valsPerRow int, repeat uint64, startColumn uint64, maxColumn uint64) *biterator { + return &biterator{vals: target, valsPerRow: uint64(valsPerRow), startColumn: startColumn, column: startColumn, maxColumn: maxColumn, updateChan: updateChan, updateID: updateID, repeat: repeat} +} + +func (b *biterator) Values() (int64, int64) { + return b.generated, b.tries +} + +func (b *biterator) next() { + b.column++ + if b.column >= b.maxColumn { + b.column = b.startColumn + b.row++ + } + if b.updateChan != nil && (b.tries%updatePeriod) == 0 { + b.updateChan <- taskUpdate{id: b.updateID, colCount: int64(b.column), rowCount: int64(b.row), done: b.row >= uint64(len(b.vals))} + } +} + +func (b *biterator) NextRecord() (ret pilosa.Record, err error) { + // fail out without trying to bump the column/row count + if b.row >= uint64(len(b.vals)) { + return nil, io.EOF + } + // We'll return as soon as we find a valid item, so we want to jump + // to one past it. if we don't find a valid item, this will gratuitously + // increment b.column once, which should be harmless. + defer b.next() + for b.row < uint64(len(b.vals)) { + b.tries++ + if b.vals[b.row].bit((b.column/b.repeat)%b.valsPerRow) == 1 { + b.generated++ + return pilosa.Column{ColumnID: b.column, RowID: b.row}, nil + } + b.next() + } + return nil, io.EOF +} + +// digiterator is an iterator which yields values which have some "digit" +// iterating in a sequence. Actually it's a base-n digit; if n==3, for instance, +// the values computed will be multiples of 3 plus {0,1,2} in the digit +// order... I can't explain this but it's cool. +type digiterator struct { + digits []uint64 // not a bitSet, a series of digit values + n uint64 + repeat uint64 + cycle uint64 + column uint64 + startColumn uint64 + maxColumn uint64 + counter uint64 + updateChan chan taskUpdate + updateID string +} + +func (d *digiterator) next() { + d.column++ + if d.column%d.cycle == 0 { + d.counter++ + } + if d.updateChan != nil && ((d.column-d.startColumn)%updatePeriod) == 0 { + d.updateChan <- taskUpdate{id: d.updateID, colCount: int64(d.column - d.startColumn), rowCount: 0, done: d.column >= d.maxColumn} + } +} + +// NextRecord returns the next value from a digiterator, which is magic. +// +// This is all done is base3 so you can see the magic in only a few screens of +// comment. The idea is that if we have a pattern with n=3 and exp=0, we count +// digits in the 3^0 place, and if it's exp=1, we count digits in the 3^1 place. +// So we iterate through the column%repeat values below the inserted digit, and +// when the digit wraps, we bump the counter, which goes above the inserted +// digit. for exp=0, repeat=1, and the sub-counter is always 0. 
+// +// exp=0 counter digit result +// 0 0 0 +// 0 0 0 +// 0 1 1 +// 0 0 0 +// 0 1 1 +// 0 2 2 +// 1 0 10 +// +// exp=1 counter digit under result +// 0 0 0 +// 0 1 1 +// 0 2 2 +// 0 0 0 +// 0 1 1 +// 0 2 2 +// 1 0 10 +// 1 1 11 +// 1 2 12 +// 0 0 0 +// 0 1 1 +// 0 2 2 +// 1 0 10 +// 1 1 11 +// 1 2 12 +// 2 0 20 +// 2 1 21 +// 2 2 22 +// 1 0 0 100 +func (d *digiterator) NextRecord() (pilosa.Record, error) { + if d.column >= d.maxColumn { + return nil, io.EOF + } + defer d.next() + under := d.column % d.repeat + digit := d.digits[(d.column/d.repeat)%uint64(len(d.digits))] + val := (d.counter * d.repeat * d.n) + (digit * d.repeat) + under + return pilosa.FieldValue{ColumnID: d.column, Value: int64(val)}, nil +} + +func (d *digiterator) Values() (int64, int64) { + return int64(d.column - d.startColumn), int64(d.column - d.startColumn) +} + +func newDigiterator(updateChan chan taskUpdate, updateID string, target []uint64, n int64, repeat uint64, startColumn uint64, maxColumn uint64) *digiterator { + cycle := uint64(len(target)) * uint64(repeat) + return &digiterator{ + updateChan: updateChan, + updateID: updateID, + digits: target, + startColumn: startColumn, + column: startColumn, + maxColumn: maxColumn, + repeat: repeat, + n: uint64(n), + cycle: cycle, + counter: startColumn / cycle, + } +} + +// newTrianglePattern creates a new triangle pattern, treating the n value as +// the number of distinct digit values to cycle, and the mod value as an exponent. +// Triangle patterns ignore their seed. +func newTrianglePattern(n int64, mod, _ int64) (Pattern, error) { + if n <= 0 { + return nil, fmt.Errorf("triangle pattern requires positive n, %d is not positive", n) + } + if mod < 0 { + return nil, fmt.Errorf("triangle pattern requires non-negative mod, %d is negative", mod) + } + t := &trianglePattern{n: n, exp: mod, repeat: 1} + // things repeat n^exp times + for i := int64(0); i < t.exp; i++ { + t.repeat *= uint64(t.n) + } + // populate bitmasks for the rows + t.equal = make([]bitSet, t.n) + t.once = make([]bitSet, t.n) + t.upto = make([]bitSet, t.n) + t.over = make([]bitSet, t.n) + // Triangle number n+[n-1]+...0 turns out to be (n^2 + n) / 2. This + // is how many bits, total, we need per row. + bitLen := (t.n * (t.n + 1)) / 2 + wordLen := (bitLen + 63) / 64 + // The actual set of last digits. + t.digits = make([]uint64, 0, bitLen) + for max := uint64(0); max < uint64(t.n); max++ { + for i := uint64(0); i <= max; i++ { + t.digits = append(t.digits, i) + } + } + // words for the four bitmasks, for a given row + wordsPerRow := wordLen * 4 + words := make([]uint64, t.n*wordsPerRow) + for row := int64(0); row < t.n; row++ { + rowWords := words[wordsPerRow*row : wordsPerRow*(row+1)] + t.equal[row] = rowWords[wordLen*0 : wordLen*1] + t.once[row] = rowWords[wordLen*1 : wordLen*2] + t.upto[row] = rowWords[wordLen*2 : wordLen*3] + t.over[row] = rowWords[wordLen*3 : wordLen*4] + didOnce := false + // compute the val/once rows directly + for i := range t.digits { + if row == int64(t.digits[i]) { + t.equal[row].set(uint64(i)) + if !didOnce { + t.once[row].set(uint64(i)) + didOnce = true + } + } + } + // compute upto, which is the previous upto plus the current + // val. + if row > 0 { + t.upto[row].add(t.upto[row-1]) + } + t.upto[row].add(t.equal[row]) + // over includes the current value; we can't do it the other + // way until we're done with these, though. 
+ t.over[row].add(t.equal[row]) + } + // count backwards to populate over + for row := t.n - 2; row >= 0; row-- { + t.over[row].add(t.over[row+1]) + } + return t, nil +} + +func (t *trianglePattern) Rows() int64 { + return t.n +} + +func (t *trianglePattern) display() { + fmt.Printf("equal:\n") + t.equal.print() + fmt.Printf("once:\n") + t.once.print() + fmt.Printf("upto:\n") + t.upto.print() + fmt.Printf("over:\n") + t.over.print() +} + +func (t *trianglePattern) Max(name string, columns uint64) (int64, error) { + switch name { + case "equal", "once", "upto", "over": + return t.n, nil + case "value": + // For every t.repeat values generated, we generate one digit + // from our string of digits. For every time we make it through + // that cycle, we produce a total of t.n distinct values from + // digits, times the t.repeat values generated for each of them. + digitCycleLength := t.repeat * uint64(len(t.digits)) + digitCycleValues := t.repeat * uint64(t.n) + // Round up to a whole digit cycle rather than trying to + // figure out exactly where in it we'd be. + cycles := (columns + digitCycleLength - 1) / digitCycleLength + // fmt.Printf("max for n %d, exp %d, cols %d: cycleLength %d, values %d, cycles %d, total max %d\n", t.n, t.exp, columns, digitCycleLength, digitCycleValues, cycles, cycles*digitCycleValues) + return int64(cycles * digitCycleValues), nil + default: + return 0, fmt.Errorf("unknown subpattern %q for triangle pattern", name) + } +} + +func (t *trianglePattern) BitGenerator(name string, ts *taskSpec, updateChan chan taskUpdate, updateID string) (CountingIterator, error) { + var target bitSetSlice + switch name { + case "equal": + target = t.equal + case "once": + target = t.once + case "upto": + target = t.upto + case "over": + target = t.over + case "value": + return nil, errors.New("sub-pattern value is only usable for int/mutex fields") + } + if target == nil { + return nil, fmt.Errorf("unknown sub-pattern %q", name) + } + return newBiterator(target, updateChan, updateID, len(t.digits), t.repeat, uint64(ts.ColumnOffset), uint64(ts.ColumnOffset)+*ts.Columns), nil +} + +func (t *trianglePattern) DigitGenerator(name string, ts *taskSpec, updateChan chan taskUpdate, updateID string) (CountingIterator, error) { + if name != "value" { + bitMaybe, err := t.BitGenerator(name, ts, updateChan, updateID) + if err == nil && bitMaybe != nil { + return nil, fmt.Errorf("sub-pattern %q only usable for set fields", name) + } + return nil, fmt.Errorf("unknown sub-pattern %q", name) + } + return newDigiterator(updateChan, updateID, t.digits, t.n, t.repeat, uint64(ts.ColumnOffset), uint64(ts.ColumnOffset)+*ts.Columns), nil +} diff --git a/imagine/generators_test.go b/imagine/generators_test.go index 6437bd1..5e73d3f 100644 --- a/imagine/generators_test.go +++ b/imagine/generators_test.go @@ -7,6 +7,7 @@ import ( "time" gopilosa "github.com/pilosa/go-pilosa" + pilosa "github.com/pilosa/go-pilosa" ) func testSequenceGenerator(s sequenceGenerator, min int64, max int64, total int64) error { @@ -249,3 +250,193 @@ func TestMutexGen(t *testing.T) { } } + +func testOnePatternBits(t *testing.T, p Pattern, name string, ts *taskSpec, expected []pilosa.Column) { + valGen, err := p.BitGenerator(name, ts, nil, "foo") + if err != nil { + t.Fatalf("error creating %s generator: %v", name, err) + } + rec, err := valGen.NextRecord() + count := 0 + for err == nil { + col, ok := rec.(pilosa.Column) + if !ok { + t.Fatalf("%s got unexpected record type %T", name, rec) + } + if count >= len(expected) { + 
t.Fatalf("extra result: %s got %#v after %d expected results", + name, col, count) + } + if col.ColumnID != expected[count].ColumnID || col.RowID != expected[count].RowID { + t.Fatalf("expected %s result %d to be %#v, got %#v", + name, count, expected[count], col) + } + count++ + rec, err = valGen.NextRecord() + } + if count < len(expected) { + t.Fatalf("%s early return, %d/%d values", name, count, len(expected)) + } + if err != io.EOF { + t.Fatalf("%s expected io.EOF, got %v", name, err) + } +} + +func testOnePatternValues(t *testing.T, p Pattern, name string, ts *taskSpec, expected []pilosa.FieldValue) { + valGen, err := p.DigitGenerator(name, ts, nil, "foo") + if err != nil { + t.Fatalf("error creating %s generator: %v", name, err) + } + rec, err := valGen.NextRecord() + count := 0 + for err == nil { + col, ok := rec.(pilosa.FieldValue) + if !ok { + t.Fatalf("%s got unexpected record type %T", name, rec) + } + if count >= len(expected) { + t.Fatalf("extra result: %s got %#v after %d expected results", + name, col, count) + } + if col.ColumnID != expected[count].ColumnID || col.Value != expected[count].Value { + t.Fatalf("expected %s result %d to be %#v, got %#v", + name, count, expected[count], col) + } + count++ + rec, err = valGen.NextRecord() + } + if count < len(expected) { + t.Fatalf("%s early return, %d/%d values", name, count, len(expected)) + } + if err != io.EOF { + t.Fatalf("%s expected io.EOF, got %v", name, err) + } +} + +type triangleSpecExpected struct { + n, exp, col int64 + expectedBits map[string][]pilosa.Column + expectedValues []pilosa.FieldValue +} + +func TestPatternGen(t *testing.T) { + testCases := []triangleSpecExpected{ + { + n: 3, + exp: 0, + col: 8, + expectedBits: map[string][]pilosa.Column{ + "equal": []pilosa.Column{ + {ColumnID: 0, RowID: 0}, + {ColumnID: 1, RowID: 0}, + {ColumnID: 3, RowID: 0}, + {ColumnID: 6, RowID: 0}, + {ColumnID: 7, RowID: 0}, + {ColumnID: 2, RowID: 1}, + {ColumnID: 4, RowID: 1}, + {ColumnID: 5, RowID: 2}, + }, + "once": []pilosa.Column{ + {ColumnID: 0, RowID: 0}, + {ColumnID: 6, RowID: 0}, + {ColumnID: 2, RowID: 1}, + {ColumnID: 5, RowID: 2}, + }, + "upto": []pilosa.Column{ + {ColumnID: 0, RowID: 0}, + {ColumnID: 1, RowID: 0}, + {ColumnID: 3, RowID: 0}, + {ColumnID: 6, RowID: 0}, + {ColumnID: 7, RowID: 0}, + {ColumnID: 0, RowID: 1}, + {ColumnID: 1, RowID: 1}, + {ColumnID: 2, RowID: 1}, + {ColumnID: 3, RowID: 1}, + {ColumnID: 4, RowID: 1}, + {ColumnID: 6, RowID: 1}, + {ColumnID: 7, RowID: 1}, + {ColumnID: 0, RowID: 2}, + {ColumnID: 1, RowID: 2}, + {ColumnID: 2, RowID: 2}, + {ColumnID: 3, RowID: 2}, + {ColumnID: 4, RowID: 2}, + {ColumnID: 5, RowID: 2}, + {ColumnID: 6, RowID: 2}, + {ColumnID: 7, RowID: 2}, + }, + "over": []pilosa.Column{ + {ColumnID: 0, RowID: 0}, + {ColumnID: 1, RowID: 0}, + {ColumnID: 2, RowID: 0}, + {ColumnID: 3, RowID: 0}, + {ColumnID: 4, RowID: 0}, + {ColumnID: 5, RowID: 0}, + {ColumnID: 6, RowID: 0}, + {ColumnID: 7, RowID: 0}, + {ColumnID: 2, RowID: 1}, + {ColumnID: 4, RowID: 1}, + {ColumnID: 5, RowID: 1}, + {ColumnID: 5, RowID: 2}, + }, + }, + expectedValues: []pilosa.FieldValue{ + {ColumnID: 0, Value: 0}, + {ColumnID: 1, Value: 0}, + {ColumnID: 2, Value: 1}, + {ColumnID: 3, Value: 0}, + {ColumnID: 4, Value: 1}, + {ColumnID: 5, Value: 2}, + {ColumnID: 6, Value: 3}, + {ColumnID: 7, Value: 3}, + }, + }, + { + n: 3, + exp: 1, + col: 16, + expectedBits: map[string][]pilosa.Column{ + "once": []pilosa.Column{ + {ColumnID: 0, RowID: 0}, + {ColumnID: 1, RowID: 0}, + {ColumnID: 2, RowID: 0}, + {ColumnID: 6, RowID: 1}, + 
+					{ColumnID: 7, RowID: 1},
+					{ColumnID: 8, RowID: 1},
+					{ColumnID: 15, RowID: 2},
+				},
+			},
+			expectedValues: []pilosa.FieldValue{
+				{ColumnID: 0, Value: 0},
+				{ColumnID: 1, Value: 1},
+				{ColumnID: 2, Value: 2},
+				{ColumnID: 3, Value: 0},
+				{ColumnID: 4, Value: 1},
+				{ColumnID: 5, Value: 2},
+				{ColumnID: 6, Value: 3},
+				{ColumnID: 7, Value: 4},
+				{ColumnID: 8, Value: 5},
+				{ColumnID: 9, Value: 0},
+				{ColumnID: 10, Value: 1},
+				{ColumnID: 11, Value: 2},
+				{ColumnID: 12, Value: 3},
+				{ColumnID: 13, Value: 4},
+				{ColumnID: 14, Value: 5},
+				{ColumnID: 15, Value: 6},
+			},
+		},
+	}
+	for _, testCase := range testCases {
+		p, err := newTrianglePattern(testCase.n, testCase.exp, 0)
+		if err != nil {
+			t.Fatalf("generating trianglePattern: %v", err)
+		}
+		colVal := uint64(testCase.col)
+		ts := taskSpec{Columns: &colVal}
+		for name, expected := range testCase.expectedBits {
+			testOnePatternBits(t, p, name, &ts, expected)
+		}
+		if testCase.expectedValues != nil {
+			testOnePatternValues(t, p, "value", &ts, testCase.expectedValues)
+		}
+	}
+}
diff --git a/imagine/spec.go b/imagine/spec.go
index 7ef6697..088d7b5 100644
--- a/imagine/spec.go
+++ b/imagine/spec.go
@@ -9,7 +9,6 @@ package imagine
 //go:generate enumer -type=timeQuantum -trimprefix=timeQuantum -text -transform=caps -output enums_timequantum.go
 
 import (
-	"errors"
 	"fmt"
 	"os"
 	"strconv"
@@ -17,6 +16,7 @@ import (
 	"time"
 
 	"github.com/BurntSushi/toml"
+	"github.com/pkg/errors"
 )
 
 type fieldType int
@@ -34,6 +34,7 @@
 const (
 	densityTypeLinear densityType = iota
 	densityTypeZipf
+	densityTypePattern
 )
 
 type dimensionOrder int
@@ -128,13 +129,21 @@ type indexSpec struct {
 	Fields []*fieldSpec
 	Seed *int64 // default PRNG seed
 	ShardWidth uint64
+	Pattern string // a global pattern to be used
+	PatternN int64 // input #1 to the pattern
+	PatternMod int64 // modifier value specific to pattern, e.g. exponent for triangle-type
+	patternGen Pattern // a pattern generated from the pattern spec
 }
 
 func (is *indexSpec) String() string {
 	if is == nil {
 		return ""
 	}
-	return fmt.Sprintf("%s [%s], %d columns, %d fields", is.Name, is.FullName, is.Columns, len(is.Fields))
+	pattern := ""
+	if is.Pattern != "" {
+		pattern = fmt.Sprintf(", pattern %s [%d:%d]", is.Pattern, is.PatternN, is.PatternMod)
+	}
+	return fmt.Sprintf("%s [%s], %d columns, %d fields%s", is.Name, is.FullName, is.Columns, len(is.Fields), pattern)
 }
 
 // fieldSpec describes a given field within an index.
@@ -147,6 +156,7 @@ type fieldSpec struct {
 	Type fieldType // "set", "mutex", "int"
 	ZipfV, ZipfS float64 // the V/S parameters of a Zipf distribution
 	ZipfA float64 // alpha parameter for a zipf distribution, should be >= 0
+	Subpattern string // Which of a pattern's sub-patterns to use. Relates to the index-wide pattern.
 	Min, Max int64 // Allowable value range for an int field. Row range for set/mutex fields.
 	SourceIndex string // SourceIndex's columns are used as value range for this field.
 	Density float64 // Base density to use in [0,1].
@@ -156,6 +166,10 @@
 	Next *fieldSpec `toml:"-"` // next fieldspec to try
 	HighestColumn int64 `toml:"-"` // highest column we've generated for this field
 	Quantum *timeQuantum // time quantum, useful only for time fields
+	Pattern *string // a global pattern to be used
+	PatternN *int64 // input #1 to the pattern
+	PatternMod *int64 // modifier value specific to pattern, e.g. exponent for triangle-type
+	patternGen Pattern // a pattern generated from this field's (or the parent's) pattern spec
 
 	FastSparse bool
 	CachePath string
@@ -216,6 +230,9 @@ func (fs *fieldSpec) String() string {
 		density = fmt.Sprintf("%.3f", fs.Density)
 	case densityTypeZipf:
 		density = fmt.Sprintf("%.3f base, Zipf v %.3f s %.3f", fs.Density, fs.ZipfV, fs.ZipfS)
+	case densityTypePattern:
+		pattern, patternN, patternMod := fs.Parent.Pattern, fs.Parent.PatternN, fs.Parent.PatternMod
+		density = fmt.Sprintf("pattern[%s][%s][%d:%d]", pattern, fs.Subpattern, patternN, patternMod)
 	}
 	switch fs.Type {
 	case fieldTypeSet:
@@ -399,6 +416,17 @@ func (is *indexSpec) Cleanup(conf *Config) error {
 	if conf.ColumnScale != 0 {
 		is.Columns *= uint64(conf.ColumnScale)
 	}
+	if is.Pattern != "" {
+		patternGenFunc, ok := patternsByName[is.Pattern]
+		if !ok {
+			return fmt.Errorf("pattern name %q unrecognized", is.Pattern)
+		}
+		var err error
+		is.patternGen, err = patternGenFunc(is.PatternN, is.PatternMod, 0)
+		if err != nil {
+			return err
+		}
+	}
 	for _, field := range is.Fields {
 		field.Parent = is
 		if is.FieldsByName[field.Name] != nil {
@@ -476,6 +504,41 @@ func (fs *fieldSpec) Cleanup(conf *Config) error {
 	} else {
 		fixDensityScale(fs.DensityScale)
 	}
+	if fs.Pattern == nil {
+		fs.Pattern = &fs.Parent.Pattern
+	}
+	if fs.PatternN == nil {
+		fs.PatternN = &fs.Parent.PatternN
+	}
+	if fs.PatternMod == nil {
+		fs.PatternMod = &fs.Parent.PatternMod
+	}
+	if fs.ValueRule == densityTypePattern {
+		if *fs.Pattern == "" {
+			return fmt.Errorf("field %s: no pattern specified", fs.Name)
+		}
+		patternGenFunc, ok := patternsByName[*fs.Pattern]
+		if !ok {
+			return fmt.Errorf("pattern name %q unrecognized", *fs.Pattern)
+		}
+		if fs.Subpattern == "" {
+			return fmt.Errorf("field %s: no subpattern specified for pattern value rule", fs.Name)
+		}
+		var err error
+		fs.patternGen, err = patternGenFunc(*fs.PatternN, *fs.PatternMod, 0)
+		if err != nil {
+			return err
+		}
+		// default to a max suitable for the parent's column count
+		if fs.Max == 0 {
+			fs.Max, err = fs.patternGen.Max(fs.Subpattern, fs.Parent.Columns)
+			if err != nil {
+				return errors.Wrapf(err, "trying to infer max from pattern %q:%q", *fs.Pattern, fs.Subpattern)
+			}
+			fmt.Printf("%s inferred maximum value: %s:%s, n %d, columns %d, max %d\n", fs.Name, *fs.Pattern, fs.Subpattern, *fs.PatternN, fs.Parent.Columns, fs.Max)
+		}
+	}
+
 	fs.CachePath = fs.Parent.Parent.CachePath
 	// no specified chance = 1.0
 	if fs.Chance == nil {