diff --git a/apophenia/README.md b/apophenia/README.md deleted file mode 100644 index 51ef4ec..0000000 --- a/apophenia/README.md +++ /dev/null @@ -1,92 +0,0 @@ -# Apophenia -- seeking patterns in randomness - -Apophenia provides an approximate emulation of a seekable pseudo-random -number generator. You provide a seed, and get a generator which can generate -a large number of pseudo-random bits which will occur in a predictable -pattern, but you can seek anywhere in that pattern in constant time. - -Apophenia's interface is intended to be similar to that of stdlib's -`math/rand`. In fact, the `Sequence` interface type is a strict superset -of `rand.Source`. - -## Implementation Notes - -AES-128, for a given seed, consists of a PRNG which, from 2^128 input -coordinates, produces 2^128 possible outputs, each of which is 128 bits -long. This is equivalent to a single-bit PRNG of period 2^135. It is -also possible to treat the input 128 bits as a coordinate system of -some sort, to allow multiple parallel sequences, etcetera. - -This design may have serious fundamental flaws, but it worked out in -light testing and I'm an optimist. - -### Sequences and Offsets - -Apophenia's underlying implementation admits 128-bit keys, and 128-bit -offsets within each sequence. In most cases: - - * That's more space than we need. - * Working with a non-native type for item numbers is annoying, - but 64-bits is enough range. - * It would be nice to avoid using the *same* pseudo-random values - for different things. - * Even when those things have the same basic identifying ID or - value. - -For instance, say you wish to generate a billion items. Each item should -have several "random" values. Some values might follow a Zipf distribution, -others might just be "U % N" for some N. If you use the item number as a -key, and seek to the same position for each of these, and get the same bits -for each of these, you may get unintended similarities or correlations between -them. - -With this in mind, apophenia divides its 128-bit offset space into a number -of spaces. The most significant bits are used for a sequence-type value, one -of: - - * SequenceDefault - * SequencePermutationK/SequencePermutationF: permutations - * SequenceWeighted: weighted bits - * SequenceLinear: linear values within a range - * SequenceZipfU: uniforms to use for Zipf values - * SequenceRandSource: default offsets for the rand.Source - * SequenceUser1/SequenceUser2: reserved for non-apophenia usage - -Other values are not yet defined, but are reserved. - -Within most of these spaces, the rest of the high word of the offset is used -for a 'seed' (used to select different sequences) and an 'iteration' (used -for successive values consumed by an algorithm). The low-order word is treated -as a 64-bit item ID. - -``` -High-order word: -0 1 2 3 -0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef -[iteration ][seed ][seq ] -Low-order word: -0 1 2 3 -0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef -[id ] -``` - -The convenience function `OffsetFor(sequence, seed, iteration, id)` -supports this usage. - -As a side effect, if generating additional values for a given seed and -id, you can increment the high-order word of the `Uint128`, -and if generating values for a new id, you can increment the low-order -word. If your algorithm consumes more than 2^24 values for a single -operation, you could start hitting values shared with other seeds. Oh, -well. - -#### Iteration usage - -For the built-in consumers: - - * Weighted consumes log2(scale) iterated values. - * Zipf consumes an *average* of no more than about 1.1 values. - * Permutation consumes one iterated value per 128 rounds of permutation, - where rounds is equal to `6*ceil(log2(max))`. (For instance, a second - value is consumed around a maximum of 2^22, and a third around 2^43.) - * Nothing else uses more than one iterated value. diff --git a/apophenia/apophenia.go b/apophenia/apophenia.go deleted file mode 100644 index 0749510..0000000 --- a/apophenia/apophenia.go +++ /dev/null @@ -1,117 +0,0 @@ -// Package apophenia provides seekable pseudo-random numbers, allowing -// reproducibility of pseudo-random results regardless of the order they're -// generated in. -package apophenia - -import ( - "crypto/aes" - "crypto/cipher" - "encoding/binary" - "math/rand" -) - -// Sequence represents a specific deterministic but pseudo-random-ish series -// of bits. A Sequence can be used as a `rand.Source` or `rand.Source64` -// for `math/rand`. -type Sequence interface { - rand.Source - Seek(Uint128) Uint128 - BitsAt(Uint128) Uint128 -} - -// aesSequence128 implements Sequence on top of an AES block cipher. -type aesSequence128 struct { - key [16]byte - cipher cipher.Block - plainText, cipherText [16]byte - offset Uint128 - err error -} - -// NewSequence generates a sequence initialized with the given seed. -func NewSequence(seed int64) Sequence { - s := aesSequence128{offset: OffsetFor(SequenceRandSource, 0, 0, 0)} - s.Seed(seed) - if s.err != nil { - panic("impossible error: " + s.err.Error()) - } - return &s -} - -// Seed sets the generator to a known state. -func (s *aesSequence128) Seed(seed int64) { - var newKey [16]byte - binary.LittleEndian.PutUint64(newKey[:8], uint64(seed)) - newCipher, err := aes.NewCipher(newKey[:]) - if err != nil { - // we can't return an error, because Seed() can't fail. also - // note that this can't actually happen, supposedly. - s.err = err - return - } - copy(s.key[:], newKey[:]) - s.cipher = newCipher - s.offset = Uint128{0, 0} -} - -// Int63 returns a value in 0..(1<<63)-1. -func (s *aesSequence128) Int63() int64 { - return int64(s.Uint64() >> 1) -} - -// Uint64 returns a value in 0..(1<<64)-1. -func (s *aesSequence128) Uint64() uint64 { - out := s.BitsAt(s.offset) - s.offset.Inc() - return out.Lo -} - -// SequenceClass denotes one of the sequence types, which are used to allow -// sequences to avoid hitting each other's pseudo-random results. -type SequenceClass uint8 - -const ( - // SequenceDefault is the zero value, used if you didn't think to pick one. - SequenceDefault SequenceClass = iota - // SequencePermutationK is the K values for the permutation algorithm. - SequencePermutationK - // SequencePermutationF is the F values for the permutation algorithm. - SequencePermutationF - // SequenceWeighted is used to generate weighted values for a given - // position. - SequenceWeighted - // SequenceLinear is the random numbers for U%N type usage. - SequenceLinear - // SequenceZipfU is the random numbers for the Zipf computations. - SequenceZipfU - // SequenceRandSource is used by default when a Sequence is being - // used as a rand.Source. - SequenceRandSource - // SequenceUser1 is eserved for non-apophenia package usage. - SequenceUser1 - // SequenceUser2 is reserved for non-apophenia package usage. - SequenceUser2 -) - -// OffsetFor determines the Uint128 offset for a given class/seed/iteration/id. -func OffsetFor(class SequenceClass, seed uint32, iter uint32, id uint64) Uint128 { - return Uint128{Hi: (uint64(class) << 56) | (uint64(seed) << 24) | uint64(iter), - Lo: id} -} - -// Seek seeks to the specified offset, yielding the previous offset. This -// sets the stream to a specific point in its cycle, affecting future calls -// to Int63 or Uint64. -func (s *aesSequence128) Seek(offset Uint128) (old Uint128) { - old, s.offset = s.offset, offset - return old -} - -// BitsAt yields the sequence of bits at the provided offset into the stream. -func (s *aesSequence128) BitsAt(offset Uint128) (out Uint128) { - binary.LittleEndian.PutUint64(s.plainText[:8], offset.Lo) - binary.LittleEndian.PutUint64(s.plainText[8:], offset.Hi) - s.cipher.Encrypt(s.cipherText[:], s.plainText[:]) - out.Lo, out.Hi = binary.LittleEndian.Uint64(s.cipherText[:8]), binary.LittleEndian.Uint64(s.cipherText[8:]) - return out -} diff --git a/apophenia/int128.go b/apophenia/int128.go deleted file mode 100644 index db3e146..0000000 --- a/apophenia/int128.go +++ /dev/null @@ -1,181 +0,0 @@ -package apophenia - -import "fmt" - -// Uint128 is an array of 2 uint64, treated as a single -// object to simplify calling conventions. -type Uint128 struct { - Lo, Hi uint64 // low-order and high-order uint64 words. Value is ``(Hi << 64) | Lo`. -} - -// Add adds value to its receiver in place. -func (u *Uint128) Add(value Uint128) { - u.Lo += value.Lo - if u.Lo < value.Lo { - u.Hi++ - } - u.Hi += value.Hi -} - -// Sub subtracts value from its receiver in place. -func (u *Uint128) Sub(value Uint128) { - u.Lo -= value.Lo - if u.Lo > value.Lo { - u.Hi-- - } - u.Hi -= value.Hi -} - -// And does a bitwise and with value, in place. -func (u *Uint128) And(value Uint128) { - u.Lo, u.Hi = u.Lo&value.Lo, u.Hi&value.Hi -} - -// Or does a bitwise or with value, in place. -func (u *Uint128) Or(value Uint128) { - u.Lo, u.Hi = u.Lo|value.Lo, u.Hi|value.Hi -} - -// Xor does a bitwise xor with value, in place. -func (u *Uint128) Xor(value Uint128) { - u.Lo, u.Hi = u.Lo^value.Lo, u.Hi^value.Hi -} - -// Not does a bitwise complement in place. -func (u *Uint128) Not() { - u.Lo, u.Hi = ^u.Lo, ^u.Hi -} - -// Mask produces a mask of the lower n bits of u. -func (u *Uint128) Mask(n uint64) { - if n >= 128 { - return - } - if n >= 64 { - u.Hi &= (1 << (n & 63)) - 1 - } else { - u.Lo &= (1 << n) - 1 - } -} - -// Mask produces a bitmask with n bits set. -func Mask(n uint64) (u Uint128) { - if n >= 128 { - u.Not() - return u - } - if n >= 64 { - u.Lo-- - u.Hi = (1 << n & 63) - 1 - return u - } - u.Lo = (1 << n) - 1 - return u -} - -// String provides a string representation. -func (u Uint128) String() string { - return fmt.Sprintf("0x%x%016x", u.Hi, u.Lo) -} - -// bit rotation: for 1-63 bits, we are moving the low-order N bits of u.Lo -// into the high-order N bits of u.Hi, and vice versa. For 64-127, it's that -// plus swapping u.Lo and u.Hi. - -// RotateRight rotates u right by n bits. -func (u *Uint128) RotateRight(n uint64) { - if n&64 != 0 { - u.Lo, u.Hi = u.Hi, u.Lo - } - n &= 63 - if n == 0 { - return - } - unbits := 64 - n - - u.Lo, u.Hi = (u.Lo>>n)|(u.Hi<>n)|(u.Lo<>unbits), (u.Hi<>unbits) -} - -// ShiftRight shifts u right by n bits. -func (u *Uint128) ShiftRight(n uint64) { - if n > 127 { - u.Lo, u.Hi = 0, 0 - return - } - if n >= 64 { - u.Lo, u.Hi = u.Hi>>(n&63), 0 - return - } - unbits := 64 - n - - u.Lo, u.Hi = (u.Lo>>n)|(u.Hi<> n) -} - -// ShiftRightCarry returns both the shifted value and the bits that -// were shifted out. Useful for when you want both x%N and x/N for -// N a power of 2. Only sane if bits <= 64. -func (u *Uint128) ShiftRightCarry(n uint64) (out Uint128, carry uint64) { - if n > 64 { - return out, carry - } - if n == 64 { - out.Lo, carry = u.Hi, u.Lo - return out, carry - } - unbits := 64 - n - - out.Lo, out.Hi, carry = (u.Lo>>n)|(u.Hi<> n), u.Lo&((1< 127 { - u.Lo, u.Hi = 0, 0 - return - } - if n >= 64 { - u.Lo, u.Hi = 0, u.Lo<<(n&63) - return - } - n &= 63 - if n == 0 { - return - } - unbits := 64 - n - - u.Lo, u.Hi = (u.Lo << n), (u.Hi<>unbits) -} - -// Bit returns 1 if the nth bit is set, 0 otherwise. -func (u *Uint128) Bit(n uint64) uint64 { - if n >= 128 { - return 0 - } - if n >= 64 { - return (u.Hi >> (n & 63)) & 1 - } - return (u.Lo >> n) & 1 -} - -// Inc increments its receiver in place. -func (u *Uint128) Inc() { - u.Lo++ - if u.Lo == 0 { - u.Hi++ - } -} diff --git a/apophenia/int128_test.go b/apophenia/int128_test.go deleted file mode 100644 index 411c4a9..0000000 --- a/apophenia/int128_test.go +++ /dev/null @@ -1,58 +0,0 @@ -package apophenia - -import ( - "testing" -) - -func Test_Int128Rotate(t *testing.T) { - cases := []struct { - in Uint128 - bits uint64 - outL, outR Uint128 - }{ - {in: Uint128{Lo: 0x1}, bits: 1, outR: Uint128{Lo: 0x0, Hi: 1 << 63}, outL: Uint128{Lo: 0x2, Hi: 0}}, - {in: Uint128{Lo: 0x11}, bits: 4, outR: Uint128{Lo: 1, Hi: 1 << 60}, outL: Uint128{Lo: 0x110, Hi: 0}}, - {in: Uint128{Lo: 0x11}, bits: 65, outR: Uint128{Lo: 1 << 63, Hi: 8}, outL: Uint128{Lo: 0, Hi: 0x22}}, - } - for _, c := range cases { - u := c.in - u.RotateRight(c.bits) - if u != c.outR { - t.Fatalf("rotate %s right by %d: expected %s, got %s", - c.in, c.bits, c.outR, u) - } - u = c.in - u.RotateLeft(c.bits) - if u != c.outL { - t.Fatalf("rotate %s left by %d: expected %s, got %s", - c.in, c.bits, c.outL, u) - } - } -} - -func Test_Int128Shift(t *testing.T) { - cases := []struct { - in Uint128 - bits uint64 - outL, outR Uint128 - }{ - {in: Uint128{Lo: 0x1}, bits: 1, outR: Uint128{Lo: 0x0, Hi: 0}, outL: Uint128{Lo: 0x2, Hi: 0}}, - {in: Uint128{Lo: 0x11}, bits: 4, outR: Uint128{Lo: 1, Hi: 0}, outL: Uint128{Lo: 0x110, Hi: 0}}, - {in: Uint128{Lo: 0x11, Hi: 0x3}, bits: 65, outR: Uint128{Lo: 1, Hi: 0}, outL: Uint128{Lo: 0, Hi: 0x22}}, - {in: Uint128{Lo: 0, Hi: 0x11}, bits: 68, outR: Uint128{Lo: 1, Hi: 0}, outL: Uint128{Lo: 0, Hi: 0}}, - } - for _, c := range cases { - u := c.in - u.ShiftRight(c.bits) - if u != c.outR { - t.Fatalf("shift %s right by %d: expected %s, got %s", - c.in, c.bits, c.outR, u) - } - u = c.in - u.ShiftLeft(c.bits) - if u != c.outL { - t.Fatalf("shift %s left by %d: expected %s, got %s", - c.in, c.bits, c.outL, u) - } - } -} diff --git a/apophenia/permute.go b/apophenia/permute.go deleted file mode 100644 index 9a78f8d..0000000 --- a/apophenia/permute.go +++ /dev/null @@ -1,127 +0,0 @@ -package apophenia - -import ( - "errors" - "math/bits" -) - -// PermutationGenerator provides a way to pass integer IDs through a permutation -// map that is pseudorandom but repeatable. This could be done with rand.Perm, -// but that would require storing a slice of [Items]int64, which we want to avoid -// for large values of Items. -// -// Not actually cryptographically secure. -type Permutation struct { - src Sequence - permSeed uint32 - max int64 - counter int64 - rounds int - bits Uint128 - k []uint64 -} - -// Design notes: -// -// This is based on: -// http://arxiv.org/abs/1208.1176v2 -// -// This simulates the results of a shuffle in a way allowing a lookup of -// the results of the shuffle for any given position, in time proportional -// to a number of "rounds", each of which is 50% likely to swap a slot -// with another slot. The number of rounds needed to achieve a reasonable -// probability of safety is log(N)*6 or so. -// -// Each permutation is fully defined by a "key", consisting of: -// 1. A key "KF" naming a value in [0,max) for each round. -// 2. A series of round functions mapping values in [0,max) to bits, -// one for each round. -// I refer to these as K[r] and F[r]. Thus, K[0] is the index used to -// compute swap operations four round 0, and F[0] is the series of bits -// used to determine whether a swap is performed, with F[0][0] being -// the swap decision for slot 0 in round 0. (Except it probably isn't, -// because the swap decision is actually made based on the highest index -// in a pair, to ensure that a swap between A and B always uses the same -// decision bit.) -// -// K values are generated using the SequencePermutationK range of offsets, -// with -// -// For F values, we set byte 8 of the plain text to 0x00, and use -// encoding/binary to dump the slot number into the first 8 bytes. This -// yields 128 values, which we treat as the values for the first 128 rounds, -// and then recycle for rounds 129+ if those exist. This is not very -// secure, but we're already at 1/2^128 chances by that time and don't care. -// We could probably trim rounds to 64 or so and not lose much data. - -// NewPermutation creates a Permutation which generates values in [0,m), -// from a given Sequence and seed value. -// -// The seed parameter selects different shuffles, and is useful if you need -// to generate multiple distinct shuffles from the same underlying sequence. -// Treat it as a secondary seed. -func NewPermutation(max int64, seed uint32, src Sequence) (*Permutation, error) { - if max < 1 { - return nil, errors.New("period must be positive") - } - // number of rounds to get "good" results is roughly 6 log N. - bits := 64 - bits.LeadingZeros64(uint64(max)) - p := Permutation{max: max, rounds: 6 * bits, counter: 0} - - p.src = src - p.k = make([]uint64, p.rounds) - p.permSeed = seed - offset := OffsetFor(SequencePermutationK, p.permSeed, 0, 0) - for i := uint64(0); i < uint64(p.rounds); i++ { - offset.Lo = i - p.k[i] = p.src.BitsAt(offset).Lo % uint64(p.max) - } - return &p, nil -} - -// Next generates the next value from the permutation. -func (p *Permutation) Next() (ret int64) { - return p.nextValue() -} - -// Nth generates the Nth value from the permutation. For instance, -// given a new permutation, calling Next once produces the same -// value you'd get from calling Nth(0). This is a seek which changes -// the offset that Next will count from; after calling Nth(x), you -// would get the same result from Next() that you would from Nth(x+1). -func (p *Permutation) Nth(n int64) (ret int64) { - p.counter = n - ret = p.nextValue() - return ret -} - -func (p *Permutation) nextValue() int64 { - p.counter = int64(uint64(p.counter) % uint64(p.max)) - x := uint64(p.counter) - p.counter++ - // a value which can't possibly be the next value we need, so we - // always hash on the first pass. - prev := uint64(p.max) + 1 - offset := OffsetFor(SequencePermutationF, p.permSeed, 0, 0) - for i := uint64(0); i < uint64(p.rounds); i++ { - if i > 0 && i&127 == 0 { - offset.Hi++ - // force regeneration of bits down below - prev = uint64(p.max) + 1 - } - xPrime := (p.k[i] + uint64(p.max) - x) % uint64(p.max) - xCaret := x - if xPrime > xCaret { - xCaret = xPrime - } - if xCaret != prev { - offset.Lo = xCaret - p.bits = p.src.BitsAt(offset) - prev = xCaret - } - if p.bits.Bit(i) != 0 { - x = xPrime - } - } - return int64(x) -} diff --git a/apophenia/permute_test.go b/apophenia/permute_test.go deleted file mode 100644 index 71e1f59..0000000 --- a/apophenia/permute_test.go +++ /dev/null @@ -1,128 +0,0 @@ -package apophenia - -import ( - "fmt" - "testing" -) - -func PermutationOrBust(period int64, seed int64, expectedErr string, tb testing.TB) *Permutation { - seq := NewSequence(seed) - p, err := NewPermutation(period, 0, seq) - if err != nil { - if expectedErr == "" || expectedErr != err.Error() { - tb.Fatalf("unexpected error creating permutation generator: %s", err) - } - return p - } - if p == nil { - tb.Fatalf("unexpected nil permutation generator without error") - } - return p -} - -func Test_PermuteCycle(t *testing.T) { - sizes := []int64{8, 23, 64, 10000} - for _, size := range sizes { - p := PermutationOrBust(size, 0, "", t) - seen := make(map[int64]struct{}, size) - for i := int64(0); i < size; i++ { - n := p.Next() - if _, ok := seen[n]; ok { - list := make([]int64, len(seen)) - j := 0 - for k := range seen { - list[j] = k - j++ - } - t.Fatalf("size %d: got duplicate entry %d in %v", size, n, list) - } - seen[n] = struct{}{} - } - } -} - -func TestPermuteSeed(t *testing.T) { - size := int64(129) - seeds := int64(8) - p := make([]*Permutation, seeds) - seen := make([]map[int64]struct{}, seeds) - for s := int64(0); s < seeds; s++ { - p[s] = PermutationOrBust(size, s, "", t) - seen[s] = make(map[int64]struct{}, size) - } - matches := int64(0) - v := make([]int64, seeds) - for i := int64(0); i < size; i++ { - for s := int64(0); s < seeds; s++ { - v[s] = p[s].Next() - if _, ok := seen[s][v[s]]; ok { - t.Fatalf("duplicate entry (size %d, seed %d, entry %d): %d", - size, s, i, v[s]) - } - seen[s][v[s]] = struct{}{} - } - for s := int64(1); s < seeds; s++ { - if v[s] == v[s-1] { - matches++ - } - } - } - // assuming number of outcomes is more than about 16, matches are pretty rare if nothing - // is wrong. - if (matches * 8) > (size * seeds) { - t.Fatalf("too many matches: %d values to permute, %d seeds, %d matches seems suspicious.", - size, seeds, matches) - } else { - t.Logf("permuting %d values across %d seeds: %d matches (OK)", size, seeds, matches) - } -} - -func Test_PermuteNth(t *testing.T) { - size := int64(129) - seeds := int64(8) - p := make([][]*Permutation, seeds) - seen := make([]map[int64]struct{}, seeds) - for s := int64(0); s < seeds; s++ { - p[s] = make([]*Permutation, 2) - p[s][0] = PermutationOrBust(size, s, "", t) - p[s][1] = PermutationOrBust(size, s, "", t) - seen[s] = make(map[int64]struct{}, size) - } - matches := int64(0) - v := make([]int64, seeds) - for i := int64(0); i < size; i++ { - for s := int64(0); s < seeds; s++ { - v[s] = p[s][0].Next() - if _, ok := seen[s][v[s]]; ok { - t.Fatalf("duplicate entry (size %d, seed %d, entry %d): %d", - size, s, i, v[s]) - } - seen[s][v[s]] = struct{}{} - vN := p[s][1].Nth(i) - if vN != v[s] { - t.Fatalf("Nth entry didn't match Nth call to Next()) (size %d, seed %d, n %d): expected %d, got %d\n", - size, s, i, v[s], vN) - } - } - } - // assuming number of outcomes is more than about 16, matches are pretty rare if nothing - // is wrong. - if (matches * 8) > (size * seeds) { - t.Fatalf("too many matches: %d values to permute, %d seeds, %d matches seems suspicious.", - size, seeds, matches) - } else { - t.Logf("permuting %d values across %d seeds: %d matches (OK)", size, seeds, matches) - } -} - -func Benchmark_PermuteCycle(b *testing.B) { - sizes := []int64{5, 63, 1000000, (1 << 19)} - for _, size := range sizes { - b.Run(fmt.Sprintf("Pool%d", size), func(b *testing.B) { - p := PermutationOrBust(size, 0, "", b) - for i := 0; i < b.N; i++ { - _ = p.Next() - } - }) - } -} diff --git a/apophenia/weighted.go b/apophenia/weighted.go deleted file mode 100644 index 0218e9c..0000000 --- a/apophenia/weighted.go +++ /dev/null @@ -1,97 +0,0 @@ -package apophenia - -import ( - "errors" - "math/bits" -) - -// Weighted provides a generator which produces weighted bits -- bits with -// a specified probability of being set, as opposed to random values weighted -// a given way. Bit density is specified as N/M, with M a (positive) -// power of 2, and N an integer between 0 and M. -// -// The underlying generator can produce 2^128 128-bit values, but the -// iterative process requires log2(densityScale) 128-bit values from the -// source per 128-bit output, so the top 7 bits of the address are used -// as an iteration counter. Thus, Weighted.Bit can produce 2^128 distinct -// values, but if the offset provided to Weighted.Bits is over 2^121, there -// will be overlap between the source values used for those bits, and -// source values used (for different iterations) for other offsets. -type Weighted struct { - src Sequence - // internal result cache - lastValue Uint128 - lastOffset Uint128 - lastDensity, lastScale uint64 -} - -// NewWeighted yields a new Weighted using the given sequence as a source of -// seekable pseudo-random bits. -func NewWeighted(src Sequence) (*Weighted, error) { - if src == nil { - return nil, errors.New("new Weighted requires a non-nil source") - } - w := Weighted{src: src} - return &w, nil -} - -// Bit returns the single 0-or-1 bit at the specified offset. -func (w *Weighted) Bit(offset Uint128, density uint64, scale uint64) uint64 { - var bit uint64 - // In order to be able to cache/reuse values, we want to grab a whole - // set of 128 bits including a given offset, and use the same - // calculation for all of them. So we mask out the low-order 7 bits - // of offset, and use them separately. Meanwhile, Bits will - // always right-shift its column bits by 7, which reduces the - // space of possible results but means that it produces the same - // set of bits for any given batch... - offset.Lo, bit = offset.Lo&^127, offset.Lo&127 - if offset == w.lastOffset && density == w.lastDensity && scale == w.lastScale { - return w.lastValue.Bit(bit) - } - w.lastValue = w.Bits(offset, density, scale) - w.lastOffset, w.lastDensity, w.lastScale = offset, density, scale - return w.lastValue.Bit(bit) -} - -const weightedIterationMask = (^uint64(0)) >> 7 - -// Bits returns the 128-bit set of bits including offset. The column portion -// of offset is right-shifted by 7 to match the offset calculations in Bit(), -// above. Thus, you get the same values back for each sequence of 128 consecutive -// offsets. -func (w *Weighted) Bits(offset Uint128, density uint64, scale uint64) (out Uint128) { - // magic accommodation for choices made elsewhere. - offset.Lo >>= 7 - if density == scale { - out.Not() - return out - } - if density == 0 { - return out - } - lz := uint(bits.TrailingZeros64(density)) - density >>= lz - scale >>= lz - // generate the same results we would have without this hackery - offset.Hi += uint64(lz) - for scale > 1 { - next := w.src.BitsAt(offset) - if density&1 != 0 { - out.Or(next) - } else { - out.And(next) - } - density >>= 1 - scale >>= 1 - // iteration is stashed in the bottom 24-bits of an offset - offset.Hi++ - } - return out -} - -// NextBits returns the next batch of bits after the last one retrieved. -func (w *Weighted) NextBits(density, scale uint64) (out Uint128) { - w.lastOffset.Inc() - return w.Bits(w.lastOffset, density, scale) -} diff --git a/apophenia/weighted_test.go b/apophenia/weighted_test.go deleted file mode 100644 index bd61bed..0000000 --- a/apophenia/weighted_test.go +++ /dev/null @@ -1,27 +0,0 @@ -package apophenia - -import ( - "fmt" - "testing" -) - -func Benchmark_WeightedDistribution(b *testing.B) { - src := NewSequence(0) - w, err := NewWeighted(src) - if err != nil { - b.Fatalf("couldn't make weighted: %v", err) - } - scales := []uint64{3, 6, 12, 18, 24, 63} - for _, scale := range scales { - off := OffsetFor(SequenceWeighted, 0, 0, 0) - scaled := uint64(1 << scale) - b.Run(fmt.Sprintf("Scale%d", scale), func(b *testing.B) { - for i := 0; i < b.N; i++ { - w.Bits(off, 1, scaled) - w.Bits(off, scaled/2, scaled) - w.Bits(off, scaled-1, scaled) - } - }) - } - -} diff --git a/apophenia/zipf.go b/apophenia/zipf.go deleted file mode 100644 index 457ec06..0000000 --- a/apophenia/zipf.go +++ /dev/null @@ -1,113 +0,0 @@ -package apophenia - -import ( - "fmt" - "math" -) - -// Zipf produces a series of values following a Zipf distribution. -// It is initialized with values q, v, and max, and produces values -// in the range [0,max) such that the probability of a value k is -// proportional to (v+k) ** -q. v must be >= 1, q must be > 1. -// -// This is based on the same paper used for the golang stdlib Zipf -// distribution: -// -// "Rejection-Inversion to Generate Variates -// from Monotone Discrete Distributions" -// W.Hormann, G.Derflinger [1996] -// http://eeyore.wu-wien.ac.at/papers/96-04-04.wh-der.ps.gz -// -// This implementation differs from stdlib's in that it is seekable; you -// can get the Nth value in a theoretical series of results in constant -// time, without having to generate the whole series linearly. -type Zipf struct { - src Sequence - seed uint32 - q float64 - v float64 - max float64 - oneMinusQ float64 - oneOverOneMinusQ float64 - h func(float64) float64 - hInv func(float64) float64 - hImaxOneHalf float64 - hX0MinusHImaxOneHalf float64 // hX0 is only ever used as hX0 - h(i[max] + 1/2) - s float64 - idx uint64 -} - -// NewZipf returns a new Zipf object with the specified q, v, and -// max, and with its random source seeded in some way by seed. -// The sequence of values returned is consistent for a given set -// of inputs. The seed parameter can select one of multiple sub-sequences -// of the given sequence. -func NewZipf(q float64, v float64, max uint64, seed uint32, src Sequence) (z *Zipf, err error) { - if math.IsNaN(q) || math.IsNaN(v) { - return nil, fmt.Errorf("q (%g) and v (%g) must not be NaN for Zipf distribution", q, v) - } - if q <= 1 || v < 1 { - return nil, fmt.Errorf("need q > 1 (got %g) and v >= 1 (got %g) for Zipf distribution", q, v) - } - if src == nil { - return nil, fmt.Errorf("need a usable PRNG apophenia.Sequence") - } - oneMinusQ := 1 - q - oneOverOneMinusQ := 1 / (1 - q) - z = &Zipf{ - q: q, - v: v, - max: float64(max), - seed: seed, - oneMinusQ: oneMinusQ, - oneOverOneMinusQ: oneOverOneMinusQ, - } - z.h = func(x float64) float64 { - return math.Exp((1-q)*math.Log(v+x)) * oneOverOneMinusQ - } - z.hInv = func(x float64) float64 { - return -v + math.Exp(oneOverOneMinusQ*math.Log(oneMinusQ*x)) - } - hX0 := z.h(0.5) - math.Exp(math.Log(v)*-q) - z.hImaxOneHalf = z.h(z.max + 0.5) - z.hX0MinusHImaxOneHalf = hX0 - z.hImaxOneHalf - z.s = 1 - z.hInv(z.h(1.5)-math.Exp(math.Log(v+1)*-q)) - z.src = src - if err != nil { - return nil, err - } - return z, nil -} - -// Nth returns the Nth value from the sequence associated with the -// given Zipf. For a given set of input values (q, v, max, and seed), -// and a given index, the same value is returned. -func (z *Zipf) Nth(index uint64) uint64 { - z.idx = index - offset := OffsetFor(SequenceZipfU, z.seed, 0, index) - for { - bits := z.src.BitsAt(offset) - uInt := bits.Lo - u := float64(uInt&(1<<53-1)) / (1 << 53) - u = z.hImaxOneHalf + u*z.hX0MinusHImaxOneHalf - x := z.hInv(u) - k := math.Floor(x + 0.5) - if k-x <= z.s { - return uint64(k) - } - if u >= z.h(k+0.5)-math.Exp(-math.Log(z.v+k)*z.q) { - return uint64(k) - } - // the low-order 24 bits of the high-order 64-bit word - // are the "iteration", which started as zero. Assuming we - // don't need more than ~16.7M values, we're good. The expected - // average is about 1.1. - offset.Hi++ - } -} - -// Next returns the "next" value -- the one after the last one requested, or -// value 1 if none have been requested before. -func (z *Zipf) Next() uint64 { - return z.Nth(z.idx + 1) -} diff --git a/apophenia/zipf_test.go b/apophenia/zipf_test.go deleted file mode 100644 index fab6b1b..0000000 --- a/apophenia/zipf_test.go +++ /dev/null @@ -1,107 +0,0 @@ -package apophenia - -import ( - "fmt" - "math" - "math/rand" - "testing" -) - -type testCase struct { - name string - s, v float64 - m uint64 -} - -var testCases = []testCase{ - {s: 1.01, v: 1, m: 100}, - {s: 2, v: 1, m: 100}, - {s: 1.01, v: 100, m: 1000}, - {s: 2, v: 10000, m: 1000}, -} - -func (tc testCase) Name() string { - if tc.name != "" { - return tc.name - } - return fmt.Sprintf("(zipf:s%f,v%f,m%d)", tc.s, tc.v, tc.m) -} - -func runZipf(zf func() uint64, values []uint64, n uint64, t *testing.T) { - for i := uint64(0); i < n; i++ { - x := zf() - if x < 0 || x >= uint64(len(values)) { - t.Fatalf("got out-of-range value %d from zipf function", x) - } - values[x]++ - } -} - -type zipfTestCase struct { - q, v float64 - seq Sequence - exp string -} - -func (z zipfTestCase) String() string { - return fmt.Sprintf("q: %g, v: %g, seq: %t, expected error: %t", - z.q, z.v, z.seq != nil, z.exp != "") -} - -func Test_InvalidInputs(t *testing.T) { - seq := NewSequence(0) - testCases := []zipfTestCase{ - {q: 1, v: 1.1, seq: seq, exp: "need q > 1 (got 1) and v >= 1 (got 1.1) for Zipf distribution"}, - {q: 1.1, v: 0.99, seq: seq, exp: "need q > 1 (got 1.1) and v >= 1 (got 0.99) for Zipf distribution"}, - {q: 1.1, v: 1.1, seq: nil, exp: "need a usable PRNG apophenia.Sequence"}, - {q: math.NaN(), v: 1.1, seq: nil, exp: "q (NaN) and v (1.1) must not be NaN for Zipf distribution"}, - {q: 1.01, v: 2, seq: seq, exp: ""}, - } - for _, c := range testCases { - z, err := NewZipf(c.q, c.v, 20, 0, c.seq) - if c.exp != "" { - if err == nil { - t.Errorf("case %v: expected error '%s', got no error", c, c.exp) - } else if err.Error() != c.exp { - t.Errorf("case %v: expected error '%s', got error '%s'", c, c.exp, err.Error()) - } - } else { - if err != nil { - t.Errorf("case %v: unexpected error %v", c, err) - } else if z == nil { - t.Errorf("case %v: nil Zipf despite no error", c) - } - } - } -} - -const runs = 1000000 - -func Test_CompareWithMath(t *testing.T) { - failed := false - for idx, c := range testCases { - stdlibValues := make([]uint64, c.m+1) - zipfValues := make([]uint64, c.m+1) - stdlibZipf := rand.NewZipf(rand.New(rand.NewSource(int64(idx))), c.s, c.v, c.m) - seq := NewSequence(int64(idx)) - zipfZipf, err := NewZipf(c.s, c.v, c.m, 0, seq) - if err != nil { - t.Fatalf("failed to create newZipf: %s", err) - } - runZipf(stdlibZipf.Uint64, stdlibValues, runs, t) - runZipf(zipfZipf.Next, zipfValues, runs, t) - for i := uint64(0); i < c.m; i++ { - stdlibP := float64(stdlibValues[i]) / runs - zipfP := float64(zipfValues[i]) / runs - diff := math.Abs(stdlibP - zipfP) - if diff > 0.001 { - failed = true - t.Logf("%s: stdlib %d, zipf %d, diff %f [s %f, v %f]", - c.Name(), stdlibValues[i], zipfValues[i], diff, c.s, c.v) - } - } - } - if failed { - t.Fail() - } -} diff --git a/bench/zipf.go b/bench/zipf.go index 7ed871e..7a7b710 100644 --- a/bench/zipf.go +++ b/bench/zipf.go @@ -9,8 +9,8 @@ import ( "os" "time" + "github.com/molecula/apophenia" "github.com/pilosa/go-pilosa" - "github.com/pilosa/tools/apophenia" ) // ZipfBenchmark sets random bits according to the Zipf-Mandelbrot distribution. diff --git a/go.mod b/go.mod index 09af251..596ede5 100644 --- a/go.mod +++ b/go.mod @@ -14,6 +14,7 @@ require ( github.com/jaffee/commandeer v0.1.0 github.com/kr/pty v1.1.8 // indirect github.com/miekg/dns v1.1.15 // indirect + github.com/molecula/apophenia v0.0.0-20190827192002-68b7a14a478b github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f // indirect github.com/pilosa/go-pilosa v1.3.1-0.20190715210601-8606626b90d6 github.com/pilosa/pilosa v1.3.1 diff --git a/go.sum b/go.sum index d5de1de..6d3f703 100644 --- a/go.sum +++ b/go.sum @@ -185,6 +185,8 @@ github.com/mitchellh/mapstructure v1.1.2 h1:fmNYVwqnSfB9mZU6OS2O6GsXM+wcskZDuKQz github.com/mitchellh/mapstructure v1.1.2/go.mod h1:FVVH3fgwuzCH5S8UJGiWEs2h04kUh9fWfEaFds41c1Y= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/reflect2 v1.0.1/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0= +github.com/molecula/apophenia v0.0.0-20190827192002-68b7a14a478b h1:cZADDaNYM7xn/nklO3g198JerGQjadFuA0ofxBJgK0Y= +github.com/molecula/apophenia v0.0.0-20190827192002-68b7a14a478b/go.mod h1:uXd1BiH7xLmgkhVmspdJLENv6uGWrTL/MQX2TN7Yz9s= github.com/mwitkow/go-conntrack v0.0.0-20161129095857-cc309e4a2223/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U= github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U= github.com/oklog/ulid v1.3.1/go.mod h1:CirwcVhetQ6Lv90oh/F+FBtV6XMibvdAFo93nm5qn4U=