diff --git a/cmd/create.go b/cmd/create.go
index b08673c..708214c 100644
--- a/cmd/create.go
+++ b/cmd/create.go
@@ -59,7 +59,7 @@ Example:
}
GlobalConfig.Parallel = min(GlobalConfig.Parallel, len(createTableDDLs))
- logrus.Infof("Create %d table(s) and %d view(s), parallel: %d\n", len(createTableDDLs), len(createOtherDDLs), GlobalConfig.Parallel)
+ logrus.Infof("Create %d table(s) and %d view(s), parallel: %d", len(createTableDDLs), len(createOtherDDLs), GlobalConfig.Parallel)
db, err := connectDBWithoutDBName()
if err != nil {
@@ -174,7 +174,7 @@ func completeCreateConfig() (err error) {
fmatch := filepath.Join(ddldir, fmt.Sprintf("%s.*.table.sql", db))
tableddls, err := src.FileGlob([]string{fmatch})
if err != nil {
- logrus.Errorf("Get db '%s' ddls in '%s' failed\n", db, fmatch)
+ logrus.Errorf("Get db '%s' ddls in '%s' failed", db, fmatch)
return err
}
createTableDDLs = append(createTableDDLs, tableddls...)
@@ -182,7 +182,7 @@ func completeCreateConfig() (err error) {
fmatch = filepath.Join(ddldir, fmt.Sprintf("%s.*view.sql", db))
viewddls, err := src.FileGlob([]string{fmatch})
if err != nil {
- logrus.Errorf("Get db '%s' ddls in '%s' failed\n", db, fmatch)
+ logrus.Errorf("Get db '%s' ddls in '%s' failed", db, fmatch)
return err
}
createOtherDDLs = append(createOtherDDLs, viewddls...)
diff --git a/cmd/diff.go b/cmd/diff.go
index 204efe2..98c2802 100644
--- a/cmd/diff.go
+++ b/cmd/diff.go
@@ -124,7 +124,7 @@ func diffDumpSQL(replay string) error {
client := strings.TrimSuffix(filepath.Base(path2), src.ReplayResultFileExt)
clientsqls, ok := client2sqls[client]
if !ok {
- logrus.Errorf("client %s not found in original dump sql, skipping\n", client)
+ logrus.Errorf("client %s not found in original dump sql, skipping", client)
return nil
}
@@ -135,11 +135,11 @@ func diffDumpSQL(replay string) error {
defer f2.Close()
scan2 := bufio.NewScanner(f2)
- logrus.Debugf("diffing %s:\n", path2)
+ logrus.Debugf("diffing %s:", path2)
id2sqls := lo.SliceToMap(clientsqls, func(s *src.ReplaySql) (string, *src.ReplaySql) { return s.QueryId, s })
if err := diff(&diffReader{id2sqls: id2sqls}, &diffReader{scan: scan2}); err != nil {
- logrus.Errorf("diff %s failed, err: %v\n", path2, err)
+ logrus.Errorf("diff %s failed, err: %v", path2, err)
}
return nil
})
@@ -217,10 +217,10 @@ func diffTwoReplays(replay1, replay2 string) error {
defer f2.Close()
scan2 := bufio.NewScanner(f2)
- logrus.Debugf("diffing %s and %s\n", path1, path2)
+ logrus.Debugf("diffing %s and %s", path1, path2)
if err := diff(&diffReader{scan: scan1}, &diffReader{scan: scan2}); err != nil {
- logrus.Errorf("diff %s and %s failed, err: %v\n", path1, path2, err)
+ logrus.Errorf("diff %s and %s failed, err: %v", path1, path2, err)
}
return nil
})
@@ -248,7 +248,7 @@ func diff(scan1, scan2 *diffReader) error {
// print diff result
for id, diffmsg := range id2diff {
- fmt.Printf("QueryId: %s, %s\n", color.CyanString(id), diffmsg)
+ fmt.Printf("QueryId: %s, %s", color.CyanString(id), diffmsg)
if len(scan1.id2sqls) > 0 {
if s, ok := scan1.id2sqls[id]; ok {
fmt.Printf("Stmt: %s", s.Stmt)
@@ -279,7 +279,7 @@ func (r *diffReader) get(queryId string) *src.ReplayResult {
}
result := &src.ReplayResult{}
if err := json.Unmarshal(b, result); err != nil {
- logrus.Errorf("unmarshal %s failed, err: %v\n", r.scan.Text(), err)
+ logrus.Errorf("unmarshal %s failed, err: %v", r.scan.Text(), err)
return nil
}
return result
diff --git a/cmd/dump.go b/cmd/dump.go
index dc5d286..f49bf54 100644
--- a/cmd/dump.go
+++ b/cmd/dump.go
@@ -112,7 +112,7 @@ or environment variables with prefix 'DORIS_', e.g.
return err
}
- logrus.Infof("Found %d schema(s)\n", lo.SumBy(schemas, func(s *src.DBSchema) int { return len(s.Schemas) }))
+ logrus.Infof("Found %d schema(s)", lo.SumBy(schemas, func(s *src.DBSchema) int { return len(s.Schemas) }))
if err := outputSchemas(schemas); err != nil {
return err
@@ -126,7 +126,7 @@ or environment variables with prefix 'DORIS_', e.g.
return err
}
- logrus.Infof("Found %d query(s)\n", count)
+ logrus.Infof("Found %d query(s)", count)
}
// store anonymize hash dict
@@ -240,7 +240,7 @@ func dumpSchemas(ctx context.Context) ([]*src.DBSchema, error) {
schemas := make([]*src.DBSchema, len(dbs))
for i, db := range dbs {
g.Go(func() error {
- logrus.Infof("Dumping schemas from %s...\n", db)
+ logrus.Infof("Dumping schemas from %s...", db)
conn, err := connectDB(db)
if err != nil {
return err
@@ -382,14 +382,14 @@ func dumpQueriesFromTable(ctx context.Context, opts src.AuditLogScanOpts) (int,
return 0, err
}
- logrus.Infof("Dumping queries from audit log table '%s'...\n", DumpConfig.AuditLogTable)
+ logrus.Infof("Dumping queries from audit log table '%s'...", DumpConfig.AuditLogTable)
w := NewQueryWriter(1, 0)
defer w.Close()
count, err := src.GetDBAuditLogs(ctx, w, db, dbname, table, opts, GlobalConfig.Parallel)
if err != nil {
- logrus.Errorf("Extract queries from audit logs table failed, %v\n", err)
+ logrus.Errorf("Extract queries from audit logs table failed, %v", err)
return 0, err
}
@@ -451,7 +451,7 @@ func dumpQueriesFromFile(ctx context.Context, opts src.AuditLogScanOpts) (int, e
GlobalConfig.Parallel,
)
if err != nil {
- logrus.Errorf("Extract queries from audit logs file failed, %v\n", err)
+ logrus.Errorf("Extract queries from audit logs file failed, %v", err)
return 0, err
}
diff --git a/cmd/export.go b/cmd/export.go
index f19745d..b997a84 100644
--- a/cmd/export.go
+++ b/cmd/export.go
@@ -68,7 +68,7 @@ Example:
}
GlobalConfig.Parallel = min(GlobalConfig.Parallel, len(GlobalConfig.Tables))
- logrus.Infof("Export data for %d table(s) to '%s', parallel: %d\n", len(GlobalConfig.Tables), ExportConfig.ToURL, GlobalConfig.Parallel)
+ logrus.Infof("Export data for %d table(s) to '%s', parallel: %d", len(GlobalConfig.Tables), ExportConfig.ToURL, GlobalConfig.Parallel)
if len(GlobalConfig.Tables) == 0 {
return nil
}
diff --git a/cmd/gendata.go b/cmd/gendata.go
index 69facf3..074bfeb 100644
--- a/cmd/gendata.go
+++ b/cmd/gendata.go
@@ -35,6 +35,8 @@ import (
"github.com/Thearas/dodo/src/parser"
)
+const MaxGenconfs = 128 // Maximum number of genconf in a genconf YAML file
+
// GendataConfig holds the configuration values
var GendataConfig = Gendata{}
@@ -79,19 +81,18 @@ Example:
}
GlobalConfig.Parallel = min(GlobalConfig.Parallel, len(GendataConfig.genFromDDLs))
- logrus.Infof("Generate data for %d table(s), parallel: %d\n", len(GendataConfig.genFromDDLs), GlobalConfig.Parallel)
+ logrus.Infof("Generate data for %d table(s), parallel: %d", len(GendataConfig.genFromDDLs), GlobalConfig.Parallel)
if len(GendataConfig.genFromDDLs) == 0 {
return nil
}
- // 1. Construct table generators
+ // 1. Find ddl and column stats.
var (
- tableGens []*src.TableGen
- tables = make([]string, len(GendataConfig.genFromDDLs))
- statss = make([]*src.TableStats, len(GendataConfig.genFromDDLs))
+ tables = make([]string, len(GendataConfig.genFromDDLs))
+ statss = make([]*src.TableStats, len(GendataConfig.genFromDDLs))
)
for i, ddlFile := range GendataConfig.genFromDDLs {
- logrus.Debugf("generating data to %s ...\n", strings.TrimSuffix(ddlFile, ".table.sql"))
+ logrus.Debugf("generating data to %s ...", strings.TrimSuffix(ddlFile, ".table.sql"))
ddl, err := src.ReadFileOrStdin(ddlFile)
if err != nil {
@@ -104,9 +105,11 @@ Example:
tables[i] = ddl
statss[i] = stats
}
- // anonymize
+
+ // 2. LLM gen configuration.
+ // anonymize SQLs before sending to LLM
query := GendataConfig.Query
- rawTables := tables
+ origTableDDLs := tables
if AnonymizeConfig.Enabled {
SetupAnonymizer()
tables = lo.Map(tables, func(t string, i int) string { return AnonymizeSQL(GendataConfig.genFromDDLs[i], t) })
@@ -117,7 +120,7 @@ Example:
useLLM := GendataConfig.GenConf == "" && GendataConfig.LLM != ""
if useLLM {
genconfPath := filepath.Join(GlobalConfig.DodoDataDir, "gendata.yaml")
- logrus.Infof("Generating config '%s' via LLM model: %s, with anonymization: %v\n", genconfPath, GendataConfig.LLM, AnonymizeConfig.Enabled)
+ logrus.Infof("Generating config '%s' via LLM model: %s, with anonymization: %v", genconfPath, GendataConfig.LLM, AnonymizeConfig.Enabled)
genconf, err := src.LLMGendataConfig(
ctx,
@@ -126,7 +129,7 @@ Example:
[]string{query},
)
if err != nil {
- logrus.Errorf("Failed to create gendata config via LLM %s\n", GendataConfig.LLM)
+ logrus.Errorf("Failed to create gendata config via LLM %s", GendataConfig.LLM)
return err
}
@@ -135,7 +138,7 @@ Example:
return err
}
if err := src.WriteFile(genconfPath, genconf); err != nil {
- logrus.Errorf("Failed to write gendata config to %s\n", genconfPath)
+ logrus.Errorf("Failed to write gendata config to %s", genconfPath)
return err
}
if !src.Confirm(fmt.Sprintf("Using LLM output config: '%s', please check it before going on", genconfPath)) {
@@ -145,111 +148,8 @@ Example:
GendataConfig.GenConf = genconfPath
}
- // 2. Setup generator
- if err := generator.Setup(GendataConfig.GenConf); err != nil {
- return err
- }
- for i, ddlFile := range GendataConfig.genFromDDLs {
- // set streamload column mapping to the unanonymized version
- streamloadCols := []string{}
- if AnonymizeConfig.Enabled {
- streamloadCols, err = parser.GetTableCols(ddlFile, rawTables[i])
- if err != nil {
- return fmt.Errorf("failed to get columns for table %s: %v", rawTables[i], err)
- }
- }
-
- tg, err := src.NewTableGen(ddlFile, tables[i], statss[i], GendataConfig.NumRows, streamloadCols)
- if err != nil {
- return err
- }
-
- tableGens = append(tableGens, tg)
- }
-
- if GlobalConfig.DryRun {
- return nil
- } else if len(tableGens) == 0 {
- logrus.Infoln("No table to generate.")
- return nil
- }
- // store anonymize hash dict
- if AnonymizeConfig.Enabled {
- src.StoreMiniHashDict(AnonymizeConfig.Method, AnonymizeConfig.HashDictPath)
- }
-
- // 3. Generate data according to table ref dependence
- var (
- allTables = lo.Map(tableGens, func(tg *src.TableGen, _ int) string { return tg.Name })
- refTables = lo.Uniq(lo.Flatten(lo.Map(tableGens, func(tg *src.TableGen, _ int) []string { return slices.Collect(maps.Keys(tg.RefToTable)) })))
-
- refNotFoundTable = lo.Without(refTables, allTables...)
- )
- if len(refNotFoundTable) > 0 {
- return fmt.Errorf("these tables are being ref, please generate them together: %v", refNotFoundTable)
- }
-
- totalTableGens := len(allTables)
- for range totalTableGens {
- if len(tableGens) == 0 {
- return nil
- }
-
- zeroRefTableGens := lo.Filter(tableGens, func(tg *src.TableGen, _ int) bool { return len(tg.RefToTable) == 0 })
- tableGens = lo.Filter(tableGens, func(tg *src.TableGen, _ int) bool { return len(tg.RefToTable) > 0 })
-
- // check ref deadlock
- if len(zeroRefTableGens) == 0 {
- remainTable2Refs := lo.SliceToMap(tableGens, func(tg *src.TableGen) (string, []string) {
- return tg.Name, slices.Collect(maps.Keys(tg.RefToTable))
- })
- return fmt.Errorf("table refs deadlock: %v", remainTable2Refs)
- }
-
- // Generate the tables with zero ref.
- g := src.ParallelGroup(GlobalConfig.Parallel)
- for _, tg := range zeroRefTableGens {
- logrus.Infof("Generating data for table: %s, rows: %d\n", tg.Name, tg.Rows)
- g.Go(func() error {
- rowsPerFile := min(GendataConfig.RowsPerFile, tg.Rows)
- for i, end := range lo.RangeWithSteps(0, tg.Rows+rowsPerFile, rowsPerFile) {
- rows := rowsPerFile
- if end >= tg.Rows {
- rows = tg.Rows % rowsPerFile
- }
- if rows == 0 {
- break
- }
- o, err := createOutputGenDataWriter(tg.DDLFile, i+1)
- if err != nil {
- return err
- }
-
- w := bufio.NewWriterSize(o, 256*1024)
- if err := tg.GenCSV(w, rows); err != nil {
- _ = o.Close()
- return err
- }
- if err := w.Flush(); err != nil {
- _ = o.Close()
- return err
- }
- _ = o.Close()
- }
- logrus.Infof("Finish generating data for table: %s\n", tg.Name)
- return nil
- })
-
- // the ref table data is generating, remove from all waiting tableGens
- lo.ForEach(tableGens, func(g *src.TableGen, _ int) { g.RemoveRefTable(tg.Name) })
- }
-
- if err := g.Wait(); err != nil {
- return err
- }
- }
-
- return nil
+ // 3. Run data generation.
+ return MRunGenerateData(origTableDDLs, tables, statss)
},
}
@@ -314,7 +214,7 @@ func completeGendataConfig() (err error) {
fmatch := filepath.Join(GendataConfig.DDL, fmt.Sprintf("%s.*.table.sql", db))
tableddls, err := src.FileGlob([]string{fmatch})
if err != nil {
- logrus.Errorf("Get db '%s' ddls in '%s' failed\n", db, fmatch)
+ logrus.Errorf("Get db '%s' ddls in '%s' failed", db, fmatch)
return err
}
ddls = append(ddls, tableddls...)
@@ -330,6 +230,137 @@ func completeGendataConfig() (err error) {
return nil
}
+func MRunGenerateData(origTableDDLs, anonymizedTables []string, statss []*src.TableStats) (err error) {
+ // may have multi genconf in one genconf YAML file, separate by '---'
+ for i := range MaxGenconfs {
+ if err := RunGenerateData(origTableDDLs, anonymizedTables, statss, i); err != nil {
+ if errors.Is(err, &src.GenconfEndError{}) {
+ return nil
+ }
+ return err
+ }
+ logrus.Infoln("===")
+ logrus.Infof("=== Generation success (round %d) ===", i+1)
+ logrus.Infoln("===")
+ }
+ return nil
+}
+
+func RunGenerateData(origTableDDLs, anonymizedTables []string, statss []*src.TableStats, genconfIdx int) (err error) {
+ // 1. Setup generator
+ genconf := GendataConfig.GenConf
+ if err := generator.Setup(genconf, genconfIdx); err != nil {
+ if !errors.Is(err, &src.GenconfEndError{}) {
+ logrus.Errorf("Failed to read config file '%s': %v", genconf, err)
+ }
+ return err
+ }
+
+ // 2. Construct generator for each table
+ tableGens := make([]*src.TableGen, 0, len(GendataConfig.genFromDDLs))
+ for i, ddlFile := range GendataConfig.genFromDDLs {
+ // set streamload column mapping to the unanonymized version
+ streamloadCols := []string{}
+ if AnonymizeConfig.Enabled {
+ streamloadCols, err = parser.GetTableCols(ddlFile, origTableDDLs[i])
+ if err != nil {
+ return fmt.Errorf("failed to get columns for table %s: %v", origTableDDLs[i], err)
+ }
+ }
+
+ tg, err := src.NewTableGen(ddlFile, anonymizedTables[i], statss[i], GendataConfig.NumRows, streamloadCols)
+ if err != nil {
+ return err
+ }
+
+ tableGens = append(tableGens, tg)
+ }
+
+ if GlobalConfig.DryRun {
+ return nil
+ } else if len(tableGens) == 0 {
+ logrus.Infoln("No table to generate.")
+ return nil
+ }
+ // store anonymize hash dict
+ if AnonymizeConfig.Enabled {
+ src.StoreMiniHashDict(AnonymizeConfig.Method, AnonymizeConfig.HashDictPath)
+ }
+
+ // 3. Generate data according to table ref dependence
+ var (
+ allTables = lo.Map(tableGens, func(tg *src.TableGen, _ int) string { return tg.Name })
+ refTables = lo.Uniq(lo.Flatten(lo.Map(tableGens, func(tg *src.TableGen, _ int) []string { return slices.Collect(maps.Keys(tg.RefToTable)) })))
+
+ refNotFoundTable = lo.Without(refTables, allTables...)
+ )
+ if len(refNotFoundTable) > 0 {
+ return fmt.Errorf("these tables are being ref, please generate them together: %v", refNotFoundTable)
+ }
+
+ totalTableGens := len(allTables)
+ for range totalTableGens {
+ if len(tableGens) == 0 {
+ return nil
+ }
+
+ zeroRefTableGens := lo.Filter(tableGens, func(tg *src.TableGen, _ int) bool { return len(tg.RefToTable) == 0 })
+ tableGens = lo.Filter(tableGens, func(tg *src.TableGen, _ int) bool { return len(tg.RefToTable) > 0 })
+
+ // check ref deadlock
+ if len(zeroRefTableGens) == 0 {
+ remainTable2Refs := lo.SliceToMap(tableGens, func(tg *src.TableGen) (string, []string) {
+ return tg.Name, slices.Collect(maps.Keys(tg.RefToTable))
+ })
+ return fmt.Errorf("table refs deadlock: %v", remainTable2Refs)
+ }
+
+ // Generate the tables with zero ref.
+ g := src.ParallelGroup(GlobalConfig.Parallel)
+ for _, tg := range zeroRefTableGens {
+ logrus.Infof("Generating data for table: %s, rows: %d", tg.Name, tg.Rows)
+ g.Go(func() error {
+ rowsPerFile := min(GendataConfig.RowsPerFile, tg.Rows)
+ for i, end := range lo.RangeWithSteps(0, tg.Rows+rowsPerFile, rowsPerFile) {
+ rows := rowsPerFile
+ if end >= tg.Rows {
+ rows = tg.Rows % rowsPerFile
+ }
+ if rows == 0 {
+ break
+ }
+ o, err := createOutputGenDataWriter(tg.DDLFile, genconfIdx, i)
+ if err != nil {
+ return err
+ }
+
+ w := bufio.NewWriterSize(o, 256*1024)
+ if err := tg.GenCSV(w, rows); err != nil {
+ _ = o.Close()
+ return err
+ }
+ if err := w.Flush(); err != nil {
+ _ = o.Close()
+ return err
+ }
+ _ = o.Close()
+ }
+ logrus.Infof("Finish generating data for table: %s", tg.Name)
+ return nil
+ })
+
+ // the ref table data is generating, remove from all waiting tableGens
+ lo.ForEach(tableGens, func(g *src.TableGen, _ int) { g.RemoveRefTable(tg.Name) })
+ }
+
+ if err := g.Wait(); err != nil {
+ return err
+ }
+ }
+
+ return nil
+}
+
func findTableStats(ddlFileName string) (*src.TableStats, error) {
ddlFileDir := filepath.Dir(ddlFileName)
ddlFileName = filepath.Base(ddlFileName)
@@ -344,7 +375,7 @@ func findTableStats(ddlFileName string) (*src.TableStats, error) {
b, err := os.ReadFile(dbStatsFile)
if err != nil {
if errors.Is(err, os.ErrNotExist) {
- logrus.Debugf("stats file '%s' not found for db '%s'\n", dbStatsFile, db)
+ logrus.Debugf("stats file '%s' not found for db '%s'", dbStatsFile, db)
return nil, nil
}
return nil, err
@@ -360,7 +391,7 @@ func findTableStats(ddlFileName string) (*src.TableStats, error) {
continue
}
if tableStats.Columns[0].Method != "FULL" {
- logrus.Warnf("Table stats '%s.%s' is '%s' in '%s', better to dump with '--analyze' or run 'ANALYZE DATABASE `%s` WITH SYNC' before dumping\n",
+ logrus.Warnf("Table stats '%s.%s' is '%s' in '%s', better to dump with '--analyze' or run 'ANALYZE DATABASE `%s` WITH SYNC' before dumping",
db, table,
tableStats.Columns[0].Method,
dbStatsFile,
@@ -370,7 +401,7 @@ func findTableStats(ddlFileName string) (*src.TableStats, error) {
return tableStats, nil
}
- logrus.Warnf("Table stats '%s.%s' not found in '%s', better to dump with '--analyze' or run 'ANALYZE DATABASE `%s` WITH SYNC' before dumping\n",
+ logrus.Warnf("Table stats '%s.%s' not found in '%s', better to dump with '--analyze' or run 'ANALYZE DATABASE `%s` WITH SYNC' before dumping",
db, table,
dbStatsFile,
db,
@@ -378,24 +409,27 @@ func findTableStats(ddlFileName string) (*src.TableStats, error) {
return nil, nil
}
-func createOutputGenDataWriter(ddlFileName string, idx int) (*os.File, error) {
- ddlFileName = filepath.Base(ddlFileName)
- dir := filepath.Join(GendataConfig.OutputDataDir, strings.TrimSuffix(strings.TrimSuffix(ddlFileName, ".table.sql"), ".sql"))
- if idx == 1 {
- // delete previous gen files
- logrus.Debugf("Deleting previous generated data files in %s\n", dir)
+func createOutputGenDataWriter(ddlFileName string, confIdx, datafileIdx int) (*os.File, error) {
+ dir := tableGenDataDir(ddlFileName)
+ if confIdx == 0 && datafileIdx == 0 {
+ // drop previous data dir
if err := os.RemoveAll(dir); err != nil {
return nil, err
}
- if err := os.MkdirAll(dir, 0755); err != nil {
- return nil, err
- }
+ }
+ if err := os.MkdirAll(dir, 0755); err != nil {
+ return nil, fmt.Errorf("failed to create output data dir '%s': %w", dir, err)
}
- file := filepath.Join(dir, fmt.Sprintf("%d.csv", idx))
+ file := filepath.Join(dir, fmt.Sprintf("%d_%d.csv", confIdx+1, datafileIdx+1))
f, err := os.OpenFile(file, os.O_RDWR|os.O_CREATE|os.O_TRUNC, 0600)
if err != nil {
- logrus.Fatalln("Can not open output data file:", file, ", err:", err)
+ return nil, fmt.Errorf("can not open output data file: %s, err: %w", file, err)
}
return f, nil
}
+
+func tableGenDataDir(ddlFilePath string) string {
+ ddlFileName := filepath.Base(ddlFilePath)
+ return filepath.Join(GendataConfig.OutputDataDir, strings.TrimSuffix(strings.TrimSuffix(ddlFileName, ".table.sql"), ".sql"))
+}
diff --git a/cmd/import.go b/cmd/import.go
index 632701d..7f175e3 100644
--- a/cmd/import.go
+++ b/cmd/import.go
@@ -67,7 +67,7 @@ Example:
}
GlobalConfig.Parallel = min(GlobalConfig.Parallel, len(ImportConfig.table2datafiles))
- logrus.Infof("Import data for %d tables, parallel: %d\n", len(ImportConfig.table2datafiles), GlobalConfig.Parallel)
+ logrus.Infof("Import data for %d tables, parallel: %d", len(ImportConfig.table2datafiles), GlobalConfig.Parallel)
g := src.ParallelGroup(GlobalConfig.Parallel)
for table, datafiles := range ImportConfig.table2datafiles {
@@ -127,7 +127,7 @@ func completeImportConfig() (err error) {
dbPrefix := db + "."
subdirs, err := os.ReadDir(ImportConfig.Data)
if err != nil {
- logrus.Errorf("Get db '%s' data file under '%s' failed\n", db, filepath.Join(ImportConfig.Data, fmt.Sprintf("%s.*", db)))
+ logrus.Errorf("Get db '%s' data file under '%s' failed", db, filepath.Join(ImportConfig.Data, fmt.Sprintf("%s.*", db)))
return err
}
datadirs := lo.FilterMap(subdirs, func(d os.DirEntry, _ int) (string, bool) {
@@ -157,7 +157,7 @@ func completeImportConfig() (err error) {
datadir := filepath.Join(ImportConfig.Data, table, "*")
datafiles, err := src.FileGlob([]string{datadir})
if err != nil {
- logrus.Errorf("Get table '%s' data files under '%s' failed\n", table, datadir)
+ logrus.Errorf("Get table '%s' data files under '%s' failed", table, datadir)
return err
}
if len(datafiles) == 0 {
diff --git a/cmd/replay.go b/cmd/replay.go
index 7685c29..111e1ba 100644
--- a/cmd/replay.go
+++ b/cmd/replay.go
@@ -144,7 +144,7 @@ func replay(ctx context.Context) error {
buf := bufio.NewScanner(f)
buf.Buffer(make([]byte, 0, 10*1024*1024), 10*1024*1024)
- logrus.Debugf("replay file %s with filter, db: %v, user: %v, from: %s, to: %s\n",
+ logrus.Debugf("replay file %s with filter, db: %v, user: %v, from: %s, to: %s",
ReplayConfig.ReplayFile,
ReplayConfig.DBs,
ReplayConfig.Users,
diff --git a/cmd/root.go b/cmd/root.go
index ac68069..e412e74 100644
--- a/cmd/root.go
+++ b/cmd/root.go
@@ -96,7 +96,7 @@ func init() {
rootCmd.PersistentFlags().SortFlags = false
rootCmd.Flags().SortFlags = false
- parallel := min(runtime.NumCPU(), DefaultParallel)
+ parallel := max(min(runtime.NumCPU()-2, DefaultParallel), 1) // at least 1 worker, no more than NumCPU-2
pFlags := rootCmd.PersistentFlags()
pFlags.StringVarP(&GlobalConfig.ConfigFile, "config", "C", "", "Config file (default is $HOME/.dodo.yaml)")
diff --git a/example/usercase/prompt.txt b/example/usercase/prompt.txt
index fcf0e1a..0ae7109 100644
--- a/example/usercase/prompt.txt
+++ b/example/usercase/prompt.txt
@@ -7,13 +7,13 @@
步骤(必须实际执行其中的命令):
1. 按照 dodo 配置 @dodo.yaml,用 mysql 命令创建好数据库,然后用 `dodo create --ddl` 创建好相应的表,如果不确定使用哪个数据库,请询问用户
-2. 用 `dodo gendata --prompt '每张表生成 1w 行数据'` 生成数据,注意使用 AI 功能,确保数据能够被查询出来,必要时进行多轮生成数据+导入数据
+2. 用 `dodo gendata --prompt '每张事实表 1w 行数据,维度表看情况'` 生成数据,注意使用 AI 功能,确保数据能够被查询出来,必要时进行多轮生成数据+导入数据
3. 额外进行一轮生成+导入,用于注入随机数据,使数据更接近真实,用 `dodo gendata --llm '' --rows 100000` 生成数据(--llm '' 代表不使用 AI 功能)
提示:
1. 执行 dodo 命令前,设置环境变量 DORIS_YES=1
2. 用 mysql 命令跑查询而不是 dodo replay
-3. 出错时,输出完整的出错命令和错误信息
+3. 使用 AI 功能生成数据时,可以通过 `--prompt` 提额外的需求给 AI
最后:
确认复现完成后,将跑过的命令整理成一个脚本文件 `run.sh`,将脚本文件放在本文件目录下。
diff --git a/introduction-zh.md b/introduction-zh.md
index e0067de..3286bad 100644
--- a/introduction-zh.md
+++ b/introduction-zh.md
@@ -17,9 +17,9 @@
- [format](#format)
- [gen](#gen)
- [inc](#inc)
+ - [ref](#ref)
- [enum](#enum)
- [parts](#parts)
- - [ref](#ref)
- [type](#type)
- [golang](#golang)
- [复杂类型](#复杂类型-maparraystructjsonvariant)
@@ -106,7 +106,7 @@ output
### 其他导出参数
- `--analyze` 导出表前自动跑 `ANALYZE TABLE
WITH SYNC`,使统计信息更准确,默认关闭
-- `--parallel` 控制导出并发量,调大导出更快,调小占用资源更少,默认 `min(机器核数, 10)`
+- `--parallel` 控制导出并发量,调大导出更快,调小占用资源更少,默认 `min(机器核数-2, 10)`
- `--dump-stats` 导出表时也导出统计信息,导出在 `output/ddl/db.stats.yaml` 文件,默认开启
- `--only-select` 是否从只导出 `SELECT` 语句,默认开启
- `--from` 和 `--to` 导出时间范围内的 SQL
@@ -180,7 +180,8 @@ dodo import --tables db1.table1 --data 'my_table/*.csv'
### 默认的生成规则
-默认不生成 `NULL`,可以在[自定义生成规则](#自定义生成规则)中指定 `null_frequency` 更改。
+- 默认不生成 `NULL`,可以在[自定义生成规则](#自定义生成规则)中指定 `null_frequency` 更改
+- 注意字符串类型是随机生成、不可预测的,字符集是大小写字母 + 数字 (a-z, A-Z, 0-9)
各类型的默认生成规则:
@@ -204,12 +205,28 @@ dodo import --tables db1.table1 --data 'my_table/*.csv'
| DATE | | 10 years ago - now | |
| DATETIME | | 10 years ago - now | |
-- 字符串类型是随机生成的,字符集是大小写字母 + 数字 (a-z, A-Z, 0-9)
-
### 自定义生成规则
生成数据时用 `--genconf gendata.yaml` 指定,完整示例见 [example/gendata.yaml](./example/gendata.yaml)。
+你可以将多个 `gendata.yaml` 内容合并到一个文件中(以 `---` 分隔)。这相当于多次调用 `dodo gendata --genconf `。例如:
+
+```yaml
+# Dataset 1
+null_frequency: 0
+type:
+...
+tables:
+...
+---
+# Dataset 2
+null_frequency: 0.05
+type:
+...
+tables:
+...
+```
+
#### 全局规则与表规则
生成规则可以分为全局和表级别。表级别会覆盖全局配置。
@@ -335,6 +352,34 @@ columns:
start: 100 # 从 100 开始(默认 1)
```
+##### ref
+
+引用生成器,随机使用其他表的列的值。
+一般在用于关系列之间,比如 `t1 JOIN t2 ON t1.c1 = t2.c1` 或 `WHERE t1.c1 = t2.c1`:
+
+```yaml
+columns:
+ - name: t_int
+ # format: "1{{%6d}}"
+ gen:
+ ref: employees.department_id
+ limit: 100 # 随机选择 100 个值(默认 1000)
+
+ - name: t_struct # struct
+ fields:
+ - name: dp_id
+ gen:
+ ref: employees.department_id # ref can be used in nested rules
+ - name: name
+ gen:
+ ref: employees.name
+```
+
+> [!IMPORTANT]
+>
+> - 引用的源表必须一起生成
+> - 引用之间不能有死锁
+
##### enum
枚举生成器,从给定值中随机选择,枚举值可以是字面量或者生成规则:
@@ -348,11 +393,13 @@ columns:
- name: t_str
gen:
- # 随机选择一个生成规则来生成值,各有 1/4 的概率被选中
+ # 随机选择一个生成规则来生成值,各有 1/5 的概率被选中
enum:
- length: 5
- length: {min: 5, max: 10}
- format: "my name is {{username}}"
+ - gen:
+ ref: t1.c1
- gen:
enum: [1, 2, 3]
@@ -394,34 +441,6 @@ columns:
enum: [2, 4, 6, 8, 10]
```
-##### ref
-
-引用生成器,随机使用其他表的列的值。
-一般在用于关系列之间,比如 `t1 JOIN t2 ON t1.c1 = t2.c1` 或 `WHERE t1.c1 = t2.c1`:
-
-```yaml
-columns:
- - name: t_int
- # format: "1{{%6d}}"
- gen:
- ref: employees.department_id
- limit: 100 # 随机选择 100 个值(默认 1000)
-
- - name: t_struct # struct
- fields:
- - name: dp_id
- gen:
- ref: employees.department_id
- - name: name
- gen:
- ref: employees.name
-```
-
-> [!IMPORTANT]
->
-> - 引用的源表必须一起生成
-> - 引用之间不能有死锁
-
##### type
使用其他类型的生成器,比如 `varchar` 的列用 `int` 类型生成:
diff --git a/introduction.md b/introduction.md
index 2122dc4..ee26994 100644
--- a/introduction.md
+++ b/introduction.md
@@ -17,9 +17,9 @@
- [format](#format)
- [gen](#gen)
- [inc](#inc)
+ - [ref](#ref)
- [enum](#enum)
- [parts](#parts)
- - [ref](#ref)
- [type](#type)
- [golang](#golang)
- [Complex Types](#complex-types-maparraystructjsonvariant)
@@ -106,7 +106,7 @@ output
### Other Dump Parameters
- `--analyze`: Automatically runs `ANALYZE TABLE WITH SYNC` before dumping a table to make statistics more accurate. Default is off.
-- `--parallel`: Controls the dump concurrency. Increasing it speeds up the dump; decreasing it uses fewer resources. Default is `min(machine_cores, 10)`.
+- `--parallel`: Controls the dump concurrency. Increasing it speeds up the dump; decreasing it uses fewer resources. Default is `min(machine_cores-2, 10)`.
- `--dump-stats`: Also dumps table statistics when dumping tables. Statistics are dump to `output/ddl/db.stats.yaml`. Default is on.
- `--only-select`: Whether to dump only `SELECT` statements. Default is on.
- `--from` and `--to`: Dump SQL within a specified time range.
@@ -183,7 +183,8 @@ In implementation, the tool performs these actions in two stages based on the `-
### Default Generation Rules
-By default, `NULL` values are not generated. This can be changed by specifying `null_frequency` in [Custom Generation Rules](#custom-generation-rules).
+- By default, `NULL` values are not generated. This can be changed by specifying `null_frequency` in [Custom Generation Rules](#custom-generation-rules)
+- Remember that `string/text/varchar/char` values are randomly generated and unpredictable; the charset is alphanumeric (a-z, A-Z, 0-9)
Default generation rules for various types:
@@ -207,11 +208,27 @@ Default generation rules for various types:
| DATE | | 10 years ago - now | |
| DATETIME | | 10 years ago - now | |
-- The `string` type letter is randomly generated, and the charset is alphanumeric (a-z, A-Z, 0-9)
-
### Custom Generation Rules
-When generating data, specify the configuration file using `--genconf gendata.yaml`. For a complete example, see [example/gendata.yaml](./example/gendata.yaml).
+Generate data using configuration files specified via `dodo gendata --genconf gendata.yaml`. For a full reference, see [example/gendata.yaml](./example/gendata.yaml).
+
+You can concatenate multiple `gendata.yaml` contents in one file (separated by `---`). It is equivalent to calling `dodo gendata --genconf ` multiple times. Example:
+
+```yaml
+# Dataset 1
+null_frequency: 0
+type:
+ ...
+tables:
+ ...
+---
+# Dataset 2
+null_frequency: 0.05
+type:
+ ...
+tables:
+ ...
+```
#### Global Rules vs. Table Rules
@@ -233,7 +250,7 @@ type:
max: 2025-06-12
```
-Example of table-level rules:
+Example of table-level rules; columns that are not covered by the table rules will use the global default rules:
```yaml
tables:
@@ -339,6 +356,34 @@ columns:
start: 100 # Starts from 100 (default 1)
```
+##### ref
+
+Reference generator, randomly uses values from other `table.column`.
+Typically used for columns from different tables that share the same values, like relational columns `t1 JOIN t2 ON t1.c1 = t2.c1` or `WHERE t1.c1 = t2.c1`:
+
+```yaml
+columns:
+ - name: t_int
+ # format: "1{{%6d}}"
+ gen:
+ ref: employees.department_id
+ limit: 100 # Randomly select 100 values (default 1000)
+
+ - name: t_struct # struct
+ fields:
+ - name: dp_id
+ gen:
+ ref: employees.department_id # ref can be used in nested rules
+ - name: name
+ gen:
+ ref: employees.name
+```
+
+> [!IMPORTANT]
+>
+> - The source tables that are referenced must be generated together
+> - The references must not have deadlock
+
##### enum
Enum generator (aka. `enums`), randomly selects from given values, values can be literals or generators (the type will be inferred from parent generator). There is an optional config `weights` (can only be used with `enum`):
@@ -352,11 +397,13 @@ columns:
- name: t_str
gen:
- # randomly choose one literal or generators to generate value, each has 25% probability
+ # randomly choose one literal or generators to generate value, each has 20% probability
enum:
- "123"
- length: {min: 5, max: 10}
- format: "my name is {{username}}"
+ - gen:
+ ref: t1.c1
- gen:
enum: [1, 2, 3]
@@ -370,9 +417,9 @@ columns:
##### parts
-Must be used together with [`format`](#format). Flexibly combine multiple values to produce the final result.
+Must be used together with [`format`](#format). Flexibly combine multiple values to produce the final result.
-`parts` generates multiple values at a time and fills them into `{{%xxx}}` of [`format`](#format) in order. The value of each part can be a literal or a generator(the type will be inferred from parent generator):
+`parts` generates multiple values at a time and fills them into `{{%xxx}}` of [`format`](#format) in order. The value of each part can be a literal or a generator(the type will be inferred from parent generator):
```yaml
columns:
@@ -385,9 +432,7 @@ columns:
min: 1
max: 12
- gen: # day
- type: int
- min: 1
- max: 20
+ ref: table1.column1
- name: t_null_char # char(10)
format: "{{%s}}--{{%02d}}" # parts must be used with format
@@ -398,34 +443,6 @@ columns:
enum: [2, 4, 6, 8, 10]
```
-##### ref
-
-Reference generator, randomly uses values from other `table.column`.
-Typically used for relational columns, like `t1 JOIN t2 ON t1.c1 = t2.c1` or `WHERE t1.c1 = t2.c1`:
-
-```yaml
-columns:
- - name: t_int
- # format: "1{{%6d}}"
- gen:
- ref: employees.department_id
- limit: 100 # Randomly select 100 values (default 1000)
-
- - name: t_struct # struct
- fields:
- - name: dp_id
- gen:
- ref: employees.department_id
- - name: name
- gen:
- ref: employees.name
-```
-
-> [!IMPORTANT]
->
-> - The source tables that be referenced to must be generated together
-> - The references must not have deadlock
-
##### type
Uses the generator of another type. For example, generating values for a `varchar` column using an `int` type generator:
@@ -447,15 +464,12 @@ columns:
- name: t_varchar2
gen:
type: struct
- # fields: # Optional: Define rules for foo and bar if needed
- # - name: foo
- # gen:
- # inc: 1
- # start: 1000
```
##### golang
+Note: this feature should be used only as a last resort due to its poor readability — strongly consider using alternative functionality instead.
+
Uses Go code for a custom generator, supports Go stdlib:
```yaml
diff --git a/src/anonymizer.go b/src/anonymizer.go
index 0a796fe..6c43dff 100644
--- a/src/anonymizer.go
+++ b/src/anonymizer.go
@@ -45,13 +45,13 @@ func SetupAnonymizer(method, hashdictPath string, idMinLength int, reserveIds ..
if method == "minihash" {
b, err := os.OpenFile(hashdictPath, os.O_RDONLY|os.O_CREATE, 0600)
if err != nil {
- logrus.Fatalf("Failed to read hash dict file %s, err: %v\n", hashdictPath, err)
+ logrus.Fatalf("Failed to read hash dict file %s, err: %v", hashdictPath, err)
}
defer b.Close()
miniDict = make(map[string]string)
if err = yaml.NewDecoder(b).Decode(&miniDict); err != nil && err != io.EOF {
- logrus.Fatalf("Failed to decode hash dict file %s, err: %v\n", hashdictPath, err)
+ logrus.Fatalf("Failed to decode hash dict file %s, err: %v", hashdictPath, err)
}
parser.DorisLexerInit()
@@ -70,19 +70,19 @@ func StoreMiniHashDict(method, hashdictPath string) {
newPath := hashdictPath + ".new"
b, err := os.OpenFile(newPath, os.O_WRONLY|os.O_CREATE, 0600)
if err != nil {
- logrus.Errorf("Failed to store hash dict file, err: %v\n", err)
+ logrus.Errorf("Failed to store hash dict file, err: %v", err)
return
}
if err = yaml.NewEncoder(b).Encode(miniDict); err != nil {
_ = b.Close()
- logrus.Errorf("Failed to encode hash dict file, err: %v\n", err)
+ logrus.Errorf("Failed to encode hash dict file, err: %v", err)
return
}
_ = b.Close()
if err = os.Rename(newPath, hashdictPath); err != nil {
- logrus.Errorf("Failed to replace hash dict file, err: %v\n", err)
+ logrus.Errorf("Failed to replace hash dict file, err: %v", err)
}
}
@@ -147,7 +147,7 @@ func getAnonymizeFunc(method string) func(string) string {
case "minihash":
return minihashF
default:
- logrus.Warnf("Anonymization method %s is not supported, keep going with no anonymization\n", method)
+ logrus.Warnf("Anonymization method %s is not supported, keep going with no anonymization", method)
return nil
}
}
diff --git a/src/auditlog.go b/src/auditlog.go
index 3156ccb..9b6e24b 100644
--- a/src/auditlog.go
+++ b/src/auditlog.go
@@ -92,7 +92,7 @@ func ExtractQueriesFromAuditLogs(
opts AuditLogScanOpts,
parallel int,
) (int, error) {
- logrus.Infof("Extracting queries of database %v, audit logs: %v\n", opts.DBs, auditlogPaths)
+ logrus.Infof("Extracting queries of database %v, audit logs: %v", opts.DBs, auditlogPaths)
g := ParallelGroup(parallel)
diff --git a/src/create.go b/src/create.go
index d6bf285..88c1e14 100644
--- a/src/create.go
+++ b/src/create.go
@@ -75,7 +75,7 @@ func RunCreateSQL(ctx context.Context, conn *sqlx.DB, db string, sqlFile string,
interval := antlr.NewInterval(s.GetStart().GetTokenIndex(), s.GetStop().GetTokenIndex())
stmt := p.GetTokenStream().GetTextFromInterval(interval)
- logrus.Tracef("creating schema in db %s, sql: %s\n", db, stmt)
+ logrus.Tracef("creating schema in db %s, sql: %s", db, stmt)
if dryrun {
return "", nil
}
@@ -97,7 +97,7 @@ func RunCreateSQL(ctx context.Context, conn *sqlx.DB, db string, sqlFile string,
if err != nil {
if strings.Contains(err.Error(), " already exists") {
- logrus.Infof("skip creating %s '%s.%s', already exists\n", schemaType.Lower(), db, name)
+ logrus.Infof("skip creating %s '%s.%s', already exists", schemaType.Lower(), db, name)
continue
} else if strings.Contains(err.Error(), " does not exist") {
// may deppends on other table/view
diff --git a/src/db.go b/src/db.go
index 1ffe554..406256c 100644
--- a/src/db.go
+++ b/src/db.go
@@ -366,7 +366,7 @@ func GetTablesStats(ctx context.Context, conn *sqlx.DB, analyze bool, dbname str
s, err := getTableStats(ctx, conn, dbname, table)
if err != nil {
- logrus.Errorf("get table stats failed: db: %s, table: %s, err: %v\n", dbname, table, err)
+ logrus.Errorf("get table stats failed: db: %s, table: %s, err: %v", dbname, table, err)
return nil, err
}
if s == nil {
@@ -379,11 +379,11 @@ func GetTablesStats(ctx context.Context, conn *sqlx.DB, analyze bool, dbname str
}
func analyzeTableSync(ctx context.Context, conn *sqlx.DB, dbname, table string) {
- logrus.Debugf("analyzing table `%s`.`%s` with sync\n", dbname, table)
+ logrus.Debugf("analyzing table `%s`.`%s` with sync", dbname, table)
r, err := conn.QueryxContext(ctx, InternalSqlComment+fmt.Sprintf("ANALYZE TABLE `%s`.`%s` WITH SYNC", dbname, table))
if err != nil {
- logrus.Errorf("Analyze table `%s`.`%s` failed, err: %v\n", dbname, table, err)
+ logrus.Errorf("Analyze table `%s`.`%s` failed, err: %v", dbname, table, err)
}
defer r.Close()
}
@@ -432,7 +432,7 @@ func getTableStats(ctx context.Context, conn *sqlx.DB, dbname, table string) (*T
return nil, err
}
if len(cols) == 0 {
- logrus.Warnf("no column stats found for %s.%s\n", dbname, table)
+ logrus.Warnf("no column stats found for %s.%s", dbname, table)
return nil, nil
}
@@ -483,7 +483,7 @@ func GetDBAuditLogs(
}
}
- logrus.Debugf("need to scan %d audit log row(s)\n", total)
+ logrus.Debugf("need to scan %d audit log row(s)", total)
if parallel > total {
parallel = total
@@ -591,7 +591,7 @@ func getDBAuditLogsWithConds(
var r *sqlx.Rows
r, err = db.QueryxContext(ctx, InternalSqlComment+stmt)
if err != nil {
- logrus.Errorf("query audit log table failed: retry: %d, db: %s, table: %s, err: %v\n", retry, dbname, table, err)
+ logrus.Errorf("query audit log table failed: retry: %d, db: %s, table: %s, err: %v", retry, dbname, table, err)
continue
}
defer r.Close() //nolint:revive
@@ -610,7 +610,7 @@ func getDBAuditLogsWithConds(
vals, err = cast.ToStringSliceE(vals_)
if err != nil {
- logrus.Errorf("read audit log table failed: db: %s, table: %s, err: %v\n", dbname, table, err)
+ logrus.Errorf("read audit log table failed: db: %s, table: %s, err: %v", dbname, table, err)
break
}
lastTime, lastQueryId = vals[0], vals[5]
diff --git a/src/export.go b/src/export.go
index ae87d7f..716a369 100644
--- a/src/export.go
+++ b/src/export.go
@@ -73,7 +73,7 @@ func Export(
if completed {
return nil
}
- logrus.Debugf("Exporting table '%s.%s', progress: %s\n", dbname, table, progress)
+ logrus.Debugf("Exporting table '%s.%s', progress: %s", dbname, table, progress)
}
return fmt.Errorf("export table '%s.%s' timed out after %d seconds", dbname, table, timeout)
diff --git a/src/gendata.go b/src/gendata.go
index 167865d..f5ffc95 100644
--- a/src/gendata.go
+++ b/src/gendata.go
@@ -18,7 +18,10 @@ const (
GenDataFileFirstLinePrefix = "columns:" // optional first line prefix if stream load needs 'columns: xxx' header
)
-type GenRule = gen.GenRule
+type (
+ GenRule = gen.GenRule
+ GenconfEndError = gen.GenconfEndError
+)
func NewTableGen(ddlfile, createTableStmt string, stats *TableStats, rows int, streamloadColNames []string) (*TableGen, error) {
// parse create-table statement
@@ -42,9 +45,9 @@ func NewTableGen(ddlfile, createTableStmt string, stats *TableStats, rows int, s
s.Count = stats.RowCount
return s.Name, s
})
- logrus.Debugf("using stats for table '%s'\n", table)
+ logrus.Debugf("using stats for table '%s'", table)
} else {
- logrus.Debugf("stats not found for table '%s'\n", table)
+ logrus.Debugf("stats not found for table '%s'", table)
}
// get custom table gen rule
diff --git a/src/gendata_test.go b/src/gendata_test.go
index c6007ba..ce45705 100644
--- a/src/gendata_test.go
+++ b/src/gendata_test.go
@@ -12,7 +12,7 @@ import (
)
func init() {
- generator.Setup("")
+ generator.Setup("", 0)
}
func TestGendata(t *testing.T) {
diff --git a/src/generator/format.go b/src/generator/format.go
index 34b0984..bdac1fd 100644
--- a/src/generator/format.go
+++ b/src/generator/format.go
@@ -61,7 +61,7 @@ func (g *FormatGen) Gen() any {
return WriteColVal(w.(ColValWriter), result)
})
if err != nil {
- logrus.Errorf("format execute templace failed, err: %v\n", err)
+ logrus.Errorf("format execute templace failed, err: %v", err)
}
return formatted
diff --git a/src/generator/generator.go b/src/generator/generator.go
index 7e02db5..ea6b99a 100644
--- a/src/generator/generator.go
+++ b/src/generator/generator.go
@@ -30,13 +30,12 @@ func init() {
}
}
-func Setup(genconf string) error {
+func Setup(genconf string, confIdx int) error {
SetupFormatTags()
- return SetupGenRules(genconf)
+ return SetupGenRules(genconf, confIdx)
}
type GenRule = map[string]any
-
type Gen interface {
Gen() any
}
@@ -72,7 +71,7 @@ func (v *TypeVisitor) GetGen(type_ parser.IDataTypeContext) Gen {
// Merge global (aka. default) generation rules.
v.MergeDefaultRule(baseType)
if logrus.GetLevel() > logrus.DebugLevel {
- logrus.Tracef("gen rule of '%s': %s\n", v.Colpath, string(MustJSONMarshal(v.GenRule)))
+ logrus.Tracef("gen rule of '%s': %s", v.Colpath, string(MustJSONMarshal(v.GenRule)))
}
if customGenRule, ok := v.GetRule("gen").(GenRule); ok {
@@ -87,10 +86,10 @@ func (v *TypeVisitor) GetGen(type_ parser.IDataTypeContext) Gen {
if format, ok := v.GetRule("format").(string); ok && format != "" {
g, err = NewFormatGenerator(format, g)
if err != nil {
- logrus.Fatalf("The format rule '%s' of column '%s' compile failed, err: %v\n", format, v.Colpath, err)
+ logrus.Fatalf("The format rule '%s' of column '%s' compile failed, err: %v", format, v.Colpath, err)
}
} else if _, ok := g.(*PartsGen); ok {
- logrus.Fatalf("Parts generator cannot be used without format rule, please add 'format' rule for column '%s'\n", v.Colpath)
+ logrus.Fatalf("Parts generator cannot be used without format rule, please add 'format' rule for column '%s'", v.Colpath)
}
// null generator
@@ -118,17 +117,17 @@ func (v *TypeVisitor) getCustomGen(type_ parser.IDataTypeContext, customGenRule
continue
}
if g != nil {
- logrus.Fatalf("Multiple custom generators found for column '%s', only one is allowed, but got both: %s and %s\n", v.Colpath, genName, name)
+ logrus.Fatalf("Multiple custom generators found for column '%s', only one is allowed, but got both: %s and %s", v.Colpath, genName, name)
}
g, err = newCustomGen(v, type_, customGenRule)
if err != nil {
- logrus.Fatalf("Invalid custom generator '%s' for column '%s', err: %v\n", name, v.Colpath, err)
+ logrus.Fatalf("Invalid custom generator '%s' for column '%s', err: %v", name, v.Colpath, err)
}
genName = name
}
if g == nil {
- logrus.Fatalf("Custom generator not found for column '%s', expect one of %v\n",
+ logrus.Fatalf("Custom generator not found for column '%s', expect one of %v",
v.Colpath,
lo.MapToSlice(CustomGenConstructors, func(name string, _ CustomGenConstructor) string { return name }),
)
@@ -151,7 +150,7 @@ func (v *TypeVisitor) getTypeGen(type_ parser.IDataTypeContext, baseType string)
// Handle map type
kv := ty.AllDataType()
if len(kv) != 2 {
- logrus.Fatalf("Invalid map type: '%s' for column '%s', expected 2 types for key and value\n", ty.GetText(), v.Colpath)
+ logrus.Fatalf("Invalid map type: '%s' for column '%s', expected 2 types for key and value", ty.GetText(), v.Colpath)
}
// Handle key-value pair in map
@@ -172,7 +171,7 @@ func (v *TypeVisitor) getTypeGen(type_ parser.IDataTypeContext, baseType string)
fieldRules, ok := fields_.([]any) // Ensure fields is a slice of maps
if !ok {
if fields_ != nil {
- logrus.Fatalf("Invalid struct fields type '%T' for column '%s'\n", fields_, v.Colpath)
+ logrus.Fatalf("Invalid struct fields type '%T' for column '%s'", fields_, v.Colpath)
}
fieldRules = lo.ToAnySlice([]GenRule{})
}
@@ -180,11 +179,11 @@ func (v *TypeVisitor) getTypeGen(type_ parser.IDataTypeContext, baseType string)
fields := lo.SliceToMap(fieldRules, func(field_ any) (string, GenRule) {
field, ok := field_.(GenRule)
if !ok {
- logrus.Fatalf("Invalid struct field #%d in column '%s'\n", i, v.Colpath)
+ logrus.Fatalf("Invalid struct field #%d in column '%s'", i, v.Colpath)
}
fieldName, ok := field["name"].(string)
if !ok {
- logrus.Fatalf("Struct field #%d has no name in column '%s'\n", i, v.Colpath)
+ logrus.Fatalf("Struct field #%d has no name in column '%s'", i, v.Colpath)
}
i++
return fieldName, field
@@ -196,7 +195,7 @@ func (v *TypeVisitor) getTypeGen(type_ parser.IDataTypeContext, baseType string)
}
g = g_
default:
- logrus.Fatalf("Unsupported complex type: '%s' for column '%s'\n", ty.GetComplex_().GetText(), v.Colpath)
+ logrus.Fatalf("Unsupported complex type: '%s' for column '%s'", ty.GetComplex_().GetText(), v.Colpath)
}
case *parser.PrimitiveDataTypeContext:
min_, max_ := v.GetMinMax()
@@ -218,13 +217,13 @@ func (v *TypeVisitor) getTypeGen(type_ parser.IDataTypeContext, baseType string)
genRule = maps.Clone(v.GenRule)
delete(genRule, "structure")
} else {
- logrus.Fatalf("JSON/JSONB/VARIANT must have gen rule 'structure' or 'gen' at column '%s'\n", v.Colpath)
+ logrus.Fatalf("JSON/JSONB/VARIANT must have gen rule 'structure' or 'gen' at column '%s'", v.Colpath)
}
p := parser.NewParser(v.Colpath, structure)
dataType := p.DataType()
if err := p.ErrListener.LastErr; err != nil {
- logrus.Fatalf("Invalid JSON structure '%s' for column '%s': %v\n", structure, v.Colpath, err)
+ logrus.Fatalf("Invalid JSON structure '%s' for column '%s': %v", structure, v.Colpath, err)
}
visitor := NewTypeVisitor(v.Colpath, genRule)
g = visitor.GetGen(dataType)
@@ -271,17 +270,17 @@ func (v *TypeVisitor) getTypeGen(type_ parser.IDataTypeContext, baseType string)
}
if precision > p {
precision = p
- // logrus.Debugf("Precision '%d' is larger than the defined precision '%d' for column '%s', using %d instead\n", precision, p, v.Colpath, p)
+ // logrus.Debugf("Precision '%d' is larger than the defined precision '%d' for column '%s', using %d instead", precision, p, v.Colpath, p)
}
if len(intVals) > 1 {
s = cast.ToInt(intVals[1].GetText())
}
if s < 0 || s > precision {
- // logrus.Debugf("Scale '%d' is invalid for precision '%d' in column '%s', using 0 instead\n", s, precision, v.Colpath)
+ // logrus.Debugf("Scale '%d' is invalid for precision '%d' in column '%s', using 0 instead", s, precision, v.Colpath)
s = 0
}
if scale > s {
- // logrus.Debugf("Scale '%d' is larger than the defined scale '%d' for column '%s', using %d instead\n", scale, s, v.Colpath, s)
+ // logrus.Debugf("Scale '%d' is larger than the defined scale '%d' for column '%s', using %d instead", scale, s, v.Colpath, s)
scale = s
}
@@ -358,7 +357,7 @@ func (v *TypeVisitor) getTypeGen(type_ parser.IDataTypeContext, baseType string)
case "CHAR":
length_ := ty.INTEGER_VALUE(0)
if length_ == nil {
- logrus.Fatalf("CHAR type must have a length in column '%s'\n", v.Colpath)
+ logrus.Fatalf("CHAR type must have a length in column '%s'", v.Colpath)
}
length := min(max(1, cast.ToInt(length_.GetText())), 255)
g = NewFuncGen(func() any { return RandomStr(length, length) })
@@ -370,7 +369,7 @@ func (v *TypeVisitor) getTypeGen(type_ parser.IDataTypeContext, baseType string)
// skip gen HLL
g = NewFuncGen(func() any { return "" })
default: // TODO: AGG_STATE, QUANTILE_STATE
- logrus.Fatalf("Unsupported column type '%s' for column '%s'\n", type_.GetText(), v.Colpath)
+ logrus.Fatalf("Unsupported column type '%s' for column '%s'", type_.GetText(), v.Colpath)
}
}
return g
@@ -383,7 +382,7 @@ func (v *TypeVisitor) GetBaseType(type_ parser.IDataTypeContext) (t string) {
case *parser.PrimitiveDataTypeContext:
t = ty.PrimitiveColType().GetType_().GetText()
default:
- logrus.Fatalf("Unsupported column type '%s' for column '%s'\n", type_.GetText(), v.Colpath)
+ logrus.Fatalf("Unsupported column type '%s' for column '%s'", type_.GetText(), v.Colpath)
}
return strings.ToUpper(t)
}
@@ -432,7 +431,7 @@ func (v *TypeVisitor) GetMinMax() (any, any) {
func (v *TypeVisitor) GetLength() (minVal, maxVal int) {
l := v.GetRule("length")
if l == nil {
- logrus.Fatalf("length not found for column '%s'\n", v.Colpath)
+ logrus.Fatalf("length not found for column '%s'", v.Colpath)
}
switch l := l.(type) {
@@ -443,7 +442,7 @@ func (v *TypeVisitor) GetLength() (minVal, maxVal int) {
minVal, maxVal = cast.ToInt(l["min"]), cast.ToInt(l["max"])
}
if maxVal < minVal {
- logrus.Debugf("length max(%d) < min(%d), set max to min for column '%s'\n", maxVal, minVal, v.Colpath)
+ logrus.Debugf("length max(%d) < min(%d), set max to min for column '%s'", maxVal, minVal, v.Colpath)
minVal = maxVal
}
return
@@ -475,7 +474,7 @@ func (v *TypeVisitor) GetChildGen(name string, childType parser.IDataTypeContext
func (v *TypeVisitor) GetNullFrequency() float32 {
nullFrequency, err := cast.ToFloat32E(v.GetRule("null_frequency", GLOBAL_NULL_FREQUENCY))
if err != nil || nullFrequency < 0 || nullFrequency > 1 {
- logrus.Fatalf("Invalid null frequency '%v' for column '%s': %v\n", v.GetRule("null_frequency"), v.Colpath, err)
+ logrus.Fatalf("Invalid null frequency '%v' for column '%s': %v", v.GetRule("null_frequency"), v.Colpath, err)
}
return nullFrequency
}
diff --git a/src/generator/generator_defaults.go b/src/generator/generator_defaults.go
index dfa61ff..07d0304 100644
--- a/src/generator/generator_defaults.go
+++ b/src/generator/generator_defaults.go
@@ -1,6 +1,8 @@
package generator
import (
+ "errors"
+ "io"
"math"
"os"
"strconv"
@@ -13,12 +15,10 @@ import (
"gopkg.in/yaml.v3"
)
+const GLOBAL_NULL_FREQUENCY = 0.0 // Default null frequency is 0%
+
var (
- GlobalGenRule = GenRule{
- "null_frequency": GLOBAL_NULL_FREQUENCY,
- }
- GLOBAL_NULL_FREQUENCY = 0.0 // Default null frequency is 0%
- MAX_DECIMAL_INT_LEN = len(strconv.FormatInt(math.MaxInt64, 10))
+ MAX_DECIMAL_INT_LEN = len(strconv.FormatInt(math.MaxInt64, 10))
TypeAlias = map[string]string{
"INTEGER": "INT",
@@ -33,7 +33,48 @@ var (
"TIMESTAMP": "DATETIME",
}
- DefaultTypeGenRules = lo.MapValues(map[string]GenRule{
+ globalGenRule GenRule
+ DefaultTypeGenRules map[string]any
+)
+
+type GenconfEndError struct{}
+
+func (*GenconfEndError) Error() string {
+ return "genconf ended"
+}
+
+func newGlobalGenRule(configFile string, configIdx int) (GenRule, error) {
+ genrule := make(GenRule)
+ if configFile != "" {
+ f, err := os.Open(configFile)
+ if err != nil {
+ return nil, err
+ }
+
+ d := yaml.NewDecoder(f)
+ for i := 0; i <= configIdx; i++ {
+ genrule = make(GenRule)
+ if err := d.Decode(&genrule); err != nil {
+ if errors.Is(err, io.EOF) {
+ return nil, &GenconfEndError{}
+ }
+ return nil, err
+ }
+ }
+ }
+ if genrule == nil {
+ // maybe an empty YAML
+ genrule = make(GenRule)
+ }
+ genrule["null_frequency"] = GLOBAL_NULL_FREQUENCY
+ if g, ok := genrule["type"]; !ok || g == nil {
+ genrule["type"] = GenRule{}
+ }
+ return genrule, nil
+}
+
+func newDefaultTypeGenRules() map[string]any {
+ return lo.MapValues(map[string]GenRule{
"ARRAY": {
"length": GenRule{
"min": 1,
@@ -119,28 +160,24 @@ var (
"max": time.Now(),
},
}, func(v GenRule, _ string) any { return v })
-)
+}
-func SetupGenRules(configFile string) error {
- if configFile != "" {
- b, err := os.ReadFile(configFile)
- if err != nil {
- return err
- }
- if err := yaml.Unmarshal(b, &GlobalGenRule); err != nil {
- return err
- }
- }
- if g, ok := GlobalGenRule["type"]; !ok || g == nil {
- GlobalGenRule["type"] = GenRule{}
+func SetupGenRules(configFile string, configIdx int) (err error) {
+ // init GlobalGenRule and DefaultTypeGenRules
+ globalGenRule, err = newGlobalGenRule(configFile, configIdx)
+ if err != nil {
+ return err
}
- typeGenRules := lo.MapEntries(GlobalGenRule["type"].(GenRule), func(ty string, g any) (string, any) {
+ DefaultTypeGenRules = newDefaultTypeGenRules()
+
+ // merge GlobalGenRule["type"] into default type gen rules
+ typeGenRules := lo.MapEntries(globalGenRule["type"].(GenRule), func(ty string, g any) (string, any) {
if g == nil {
g = GenRule{}
}
genRule, ok := g.(GenRule)
if !ok {
- logrus.Fatalf("Type gen rule for '%s' should be a map, but got '%T'\n", ty, g)
+ logrus.Fatalf("Type gen rule for '%s' should be a map, but got '%T'", ty, g)
}
return strings.ToUpper(ty), genRule
})
@@ -153,7 +190,7 @@ func SetupGenRules(configFile string) error {
panic("Default type gen rule should be a map")
}
if r, ok := genRule["null_frequency"]; !ok || r == nil {
- genRule["null_frequency"] = GlobalGenRule["null_frequency"]
+ genRule["null_frequency"] = globalGenRule["null_frequency"]
}
}
@@ -164,9 +201,9 @@ func GetCustomTableGenRule(table string) (rows int, colrules map[string]GenRule)
tableParts := strings.Split(table, ".")
tablePart := tableParts[len(tableParts)-1]
- g, ok := GlobalGenRule["tables"].([]any)
+ g, ok := globalGenRule["tables"].([]any)
if !ok || len(g) == 0 {
- logrus.Debugf("no custom gen rule for table '%s'\n", table)
+ logrus.Debugf("no custom gen rule for table '%s'", table)
return 0, map[string]GenRule{}
}
@@ -178,7 +215,7 @@ func GetCustomTableGenRule(table string) (rows int, colrules map[string]GenRule)
return tg["name"] == tablePart
})
if !found {
- logrus.Debugf("no custom gen rule for table '%s'\n", table)
+ logrus.Debugf("no custom gen rule for table '%s'", table)
return 0, map[string]GenRule{}
}
tg := tg_.(GenRule) //nolint:revive
@@ -189,7 +226,7 @@ func GetCustomTableGenRule(table string) (rows int, colrules map[string]GenRule)
// get table columns gen rule
cgs, ok := tg["columns"].([]any)
if !ok || len(cgs) == 0 {
- logrus.Debugf("no custom gen rule for table columns '%s'\n", table)
+ logrus.Debugf("no custom gen rule for table columns '%s'", table)
return 0, map[string]GenRule{}
}
@@ -202,7 +239,7 @@ func GetCustomTableGenRule(table string) (rows int, colrules map[string]GenRule)
name, ok := cg["name"].(string)
if !ok {
- logrus.Fatalf("Column field #%d has no name in table '%s'\n", i, table)
+ logrus.Fatalf("Column field #%d has no name in table '%s'", i, table)
}
i++
return name, cg
diff --git a/src/generator/misc.go b/src/generator/misc.go
index c8bd471..01846e3 100644
--- a/src/generator/misc.go
+++ b/src/generator/misc.go
@@ -94,7 +94,7 @@ func CastMinMax[R int8 | int16 | int | int32 | int64 | float32 | float64 | time.
minBigger = any(maxVal).(time.Time).Before(any(minVal).(time.Time))
}
if minBigger {
- logrus.Warnf("Column '%s' max(%v) < min(%v), set max to min\n", colpath, maxVal, minVal)
+ logrus.Warnf("Column '%s' max(%v) < min(%v), set max to min", colpath, maxVal, minVal)
maxVal = minVal
}
return minVal, maxVal
diff --git a/src/importdata.go b/src/importdata.go
index 5be460c..e940fe3 100644
--- a/src/importdata.go
+++ b/src/importdata.go
@@ -21,7 +21,7 @@ const (
func StreamLoad(ctx context.Context, host, httpPort, user, password, db, table, file, fileProgress string, dryrun bool) error {
f, err := os.Open(file)
if err != nil {
- logrus.Errorf("Open data file '%s' failed\n", file)
+ logrus.Errorf("Open data file '%s' failed", file)
return err
}
r := bufio.NewReader(f)
@@ -47,7 +47,7 @@ func StreamLoad(ctx context.Context, host, httpPort, user, password, db, table,
curl += fmt.Sprintf(" -T '%s'", file)
sanitizedCurl := strings.Replace(curl, userpass, fmt.Sprintf("%s:****", user), 1)
- logrus.Infof("Stream load %s.%s (%s)\n", db, table, fileProgress)
+ logrus.Infof("Stream load %s.%s (%s)", db, table, fileProgress)
logrus.Debugln(sanitizedCurl)
if dryrun {
@@ -68,7 +68,7 @@ func StreamLoad(ctx context.Context, host, httpPort, user, password, db, table,
result := make(map[string]any)
if err_ := json.Unmarshal(stdout, &result); err_ != nil {
- logrus.Errorf("Stream load get result failed for '%s.%s' at data file '%s'\n", db, table, file)
+ logrus.Errorf("Stream load get result failed for '%s.%s' at data file '%s'", db, table, file)
return errors.New("stream load failed")
}
if status, ok := result["Status"]; !ok || status.(string) != "Success" {
@@ -80,7 +80,7 @@ func StreamLoad(ctx context.Context, host, httpPort, user, password, db, table,
msg = result["data"]
}
details := result["ErrorURL"]
- logrus.Errorf("Stream load failed for '%s.%s' at data file '%s', message: %v, details: %v\n", db, table, file, msg, details)
+ logrus.Errorf("Stream load failed for '%s.%s' at data file '%s', message: %v, details: %v", db, table, file, msg, details)
return errors.New("stream load failed")
}
diff --git a/src/llm.go b/src/llm.go
index 34f38b5..7bdd253 100644
--- a/src/llm.go
+++ b/src/llm.go
@@ -50,7 +50,7 @@ func LLMGendataConfig(
%s
- `,
+`,
strings.Join(tables, "\n"),
strings.Join(columnStats, "\n---\n"),
strings.Join(sqls, "\n"))
@@ -60,7 +60,7 @@ func LLMGendataConfig(
%s
- `, userPrompt, prompt_)
+`, userPrompt, prompt_)
}
logrus.Debugln("LLM user prompt:", userPrompt)
@@ -75,7 +75,8 @@ func LLMGendataConfig(
stop.SetExtraFields(map[string]any{"prefix": true})
c := client.Chat.Completions.NewStreaming(ctx, openai.ChatCompletionNewParams{
Model: model,
- Temperature: openai.Float(0.3),
+ Temperature: openai.Float(0.1),
+ TopP: openai.Float(0.9),
Stop: stop,
Messages: []openai.ChatCompletionMessageParamUnion{
openai.SystemMessage(prompt.Gendata),
diff --git a/src/llm_test.go b/src/llm_test.go
index 3f0b4ef..d479b92 100644
--- a/src/llm_test.go
+++ b/src/llm_test.go
@@ -10,6 +10,7 @@ import (
"gopkg.in/yaml.v3"
)
+// go test -timeout 600s -run ^TestLLMGendataConfig$ github.com/Thearas/dodo/src
func TestLLMGendataConfig(t *testing.T) {
apikey := os.Getenv("DORIS_DEEPSEEK_API_KEY")
if apikey == "" {
@@ -120,8 +121,9 @@ CREATE TABLE ga (
},
columnStats: []string{},
sqls: []string{`
-SELECT ha.ia,
- ha.ja + ka.la as value
+SELECT
+ ha.ia,
+ ha.ja as value
from (
select ma.ia ia,
(
@@ -180,11 +182,7 @@ from (
and date_add('2020-05-25', INTERVAL 1 DAY)
) ma on ma.ia = oa.wa
where oa.wa is not null
- ) ha,
- (
- select COALESCE (max (va), 0) la
- from c
- ) ka
+ ) ha
order by 1,2;
`},
},
@@ -193,7 +191,7 @@ order by 1,2;
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
- got, err := LLMGendataConfig(tt.args.ctx, tt.args.apiKey, tt.args.baseURL, "deepseek-chat", "每张表 10w 行", tt.args.tables, tt.args.columnStats, tt.args.sqls)
+ got, err := LLMGendataConfig(tt.args.ctx, tt.args.apiKey, tt.args.baseURL, "deepseek-chat", "每张事实表 1w 行,维度表 100 行", tt.args.tables, tt.args.columnStats, tt.args.sqls)
if (err != nil) != tt.wantErr {
t.Errorf("LLMGendataConfig() error = %v, wantErr %v", err, tt.wantErr)
return
@@ -205,7 +203,7 @@ order by 1,2;
assert.IsType(t, []any{}, gotData["tables"])
assert.True(t, lo.ContainsBy(gotData["tables"].([]any), func(item any) bool {
table := item.(map[string]any)
- return table["name"] == "c" && table["row_count"] == 100_000
+ return table["name"] == "c" && table["row_count"].(int) >= 100
}))
})
}
diff --git a/src/parser/custom.go b/src/parser/custom.go
index 64ad4c5..ad68935 100644
--- a/src/parser/custom.go
+++ b/src/parser/custom.go
@@ -93,7 +93,7 @@ func (l *ErrListener) SyntaxError(_ antlr.Recognizer, _ any, line, column int, m
// remove string after 'expecting', it's too annoying
msg := strings.Split(message, "expecting")[0]
l.LastErr = errors.New(msg)
- logrus.Errorf("sql %s parse error at line %d:%d %s\n", l.sqlId, line, column, msg)
+ logrus.Errorf("sql %s parse error at line %d:%d %s", l.sqlId, line, column, msg)
}
type listener struct {
diff --git a/src/prompt/gendata.xml b/src/prompt/gendata.xml
index 60c10a2..9ed64d6 100644
--- a/src/prompt/gendata.xml
+++ b/src/prompt/gendata.xml
@@ -5,11 +5,12 @@ Your task is generating YAML configurations for the data generation tool dodo (u
-1. The generated data must be able to be inserted into the tables in user prompt, constraints like UNIQUE KEY and PARTITIONS must be satisfied
-2. The generated data must be able to be queried by user's queries
-3. The YAML configurations should according to 'usage' below. Do not use rule key in `gendata.yaml` that haven't been documented in 'usage'
+1. The generated data must be able to be queried by user's queries
+2. The generated data must be able to be inserted into the tables in user prompt, constraints like UNIQUE KEY and PARTITIONS must be satisfied
+3. The YAML configurations should conform to 'usage' below. Do not use generation rules that haven't been documented
4. When column stats conflict with queries conditions, prioritize queries conditions and ignore column stats
-5. Output should be a valid YAML and do not output anything else except YAML
+5. No need to generate rules for columns that are not used in queries conditions (like JOIN and WHERE)
+6. Output should be a valid YAML and do not output anything else except YAML
@@ -21,7 +22,8 @@ Learn the usage of `gendata` command below (document and examples) of tool `dodo
### Default Generation Rules
-By default, `NULL` values are not generated. This can be changed by specifying `null_frequency` in [Custom Generation Rules](#custom-generation-rules).
+- By default, `NULL` values are not generated. This can be changed by specifying `null_frequency` in [Custom Generation Rules]
+- Remember that `string/text/varchar/char` values are randomly generated and unpredictable; the charset is alphanumeric (a-z, A-Z, 0-9)
Default generation rules for various types:
@@ -45,11 +47,27 @@ Default generation rules for various types:
| DATE | | 10 years ago - now | |
| DATETIME | | 10 years ago - now | |
-- The `string` type letter is randomly generated, and the charset is alphanumeric (a-z, A-Z, 0-9)
-
### Custom Generation Rules
-When generating data, specify the configuration file using `--genconf gendata.yaml`. For a complete example, see [example/gendata.yaml](./example/gendata.yaml).
+Generate data using configuration files specified via `dodo gendata --genconf gendata.yaml`. For a full reference, see [example/gendata.yaml].
+
+You can concatenate multiple `gendata.yaml` contents in one file (separated by `---`). It is equivalent to calling `dodo gendata --genconf` multiple times. Example:
+
+```yaml
+# Dataset 1
+null_frequency: 0
+type:
+ ...
+tables:
+ ...
+---
+# Dataset 2
+null_frequency: 0.05
+type:
+ ...
+tables:
+ ...
+```
#### Global Rules vs. Table Rules
@@ -71,7 +89,7 @@ type:
max: 2025-06-12
```
-Example of table-level rules:
+Example of table-level rules; columns not covered by the table rules will use the global default rules:
```yaml
tables:
@@ -136,8 +154,8 @@ columns:
No matter what generation rule, there always can have a `format`, which will run after the column data generation, generate a string basing on the template, and then output it to CSV file. There're two types of tags (aka. placeholders) can be used in `format`:
-1. Format the return value of the column, such as `{{%s}}` or `{{%d}}`, etc., with the same syntax as Go's `fmt.Sprintf()`. There can only be one such label in a `format` (except using [`parts`](#parts)).
-2. Built-in tags such as `{{month}}`, `{{year}}`, etc, all built-in tags can be found in: [src/generator/README.md](./src/generator/README.md#format-tags).
+1. Format the return value of the column, such as `{{%s}}` or `{{%d}}`, etc., with the same syntax as Go's `fmt.Sprintf()`. There can only be one such label in a `format` (except using [`parts`].
+2. Built-in tags such as `{{month}}`, `{{year}}`, etc, all built-in tags can be found in: [src/generator/README.md].
For example:
@@ -177,6 +195,34 @@ columns:
start: 100 # Starts from 100 (default 1)
```
+##### ref
+
+Reference generator, randomly uses values from other `table.column`.
+Typically used for columns from different tables that have the same values, like relational columns `t1 JOIN t2 ON t1.c1 = t2.c1` or `WHERE t1.c1 = t2.c1`:
+
+```yaml
+columns:
+ - name: t_int
+ # format: "1{{%6d}}"
+ gen:
+ ref: employees.department_id
+ limit: 100 # Randomly select 100 values (default 1000)
+
+ - name: t_struct # struct
+ fields:
+ - name: dp_id
+ gen:
+ ref: employees.department_id # ref can be used in nested rules
+ - name: name
+ gen:
+ ref: employees.name
+```
+
+> [!IMPORTANT]
+>
+> - The source tables that are referenced must be generated together
+> - The references must not form a deadlock (no circular references)
+
##### enum
Enum generator (aka. `enums`), randomly selects from given values, values can be literals or generators (the type will be inferred from parent generator). There is an optional config `weights` (can only be used with `enum`):
@@ -190,11 +236,13 @@ columns:
- name: t_str
gen:
- # randomly choose one literal or generators to generate value, each has 25% probability
+ # randomly choose one literal or generators to generate value, each has 20% probability
enum:
- "123"
- length: {min: 5, max: 10}
- format: "my name is {{username}}"
+ - gen:
+ ref: t1.c1
- gen:
enum: [1, 2, 3]
@@ -208,9 +256,9 @@ columns:
##### parts
-Must be used together with [`format`](#format). Flexibly combine multiple values to produce the final result.
+Must be used together with [`format`]. Flexibly combine multiple values to produce the final result.
-`parts` generates multiple values at a time and fills them into `{{%xxx}}` of [`format`](#format) in order. The value of each part can be a literal or a generator(the type will be inferred from parent generator):
+`parts` generates multiple values at a time and fills them into `{{%xxx}}` of [`format`] in order. The value of each part can be a literal or a generator (the type will be inferred from parent generator):
```yaml
columns:
@@ -223,9 +271,7 @@ columns:
min: 1
max: 12
- gen: # day
- type: int
- min: 1
- max: 20
+ ref: table1.column1
- name: t_null_char # char(10)
format: "{{%s}}--{{%02d}}" # parts must be used with format
@@ -236,34 +282,6 @@ columns:
enum: [2, 4, 6, 8, 10]
```
-##### ref
-
-Reference generator, randomly uses values from other `table.column`.
-Typically used for relational columns, like `t1 JOIN t2 ON t1.c1 = t2.c1` or `WHERE t1.c1 = t2.c1`:
-
-```yaml
-columns:
- - name: t_int
- # format: "1{{%6d}}"
- gen:
- ref: employees.department_id
- limit: 100 # Randomly select 100 values (default 1000)
-
- - name: t_struct # struct
- fields:
- - name: dp_id
- gen:
- ref: employees.department_id
- - name: name
- gen:
- ref: employees.name
-```
-
-> [!IMPORTANT]
->
-> - The source tables that be referenced to must be generated together
-> - The references must not have deadlock
-
##### type
Uses the generator of another type. For example, generating values for a `varchar` column using an `int` type generator:
@@ -285,15 +303,12 @@ columns:
- name: t_varchar2
gen:
type: struct
- # fields: # Optional: Define rules for foo and bar if needed
- # - name: foo
- # gen:
- # inc: 1
- # start: 1000
```
##### golang
+Note: this feature should only be used as a last resort due to its poor readability — strongly consider using alternative functionality instead.
+
Uses Go code for a custom generator, supports Go stdlib:
```yaml
diff --git a/src/prompt/gendata.xml.tpl b/src/prompt/gendata.xml.tpl
index 3a2d1cb..014f8dd 100644
--- a/src/prompt/gendata.xml.tpl
+++ b/src/prompt/gendata.xml.tpl
@@ -5,11 +5,12 @@ Your task is generating YAML configurations for the data generation tool dodo (u
-1. The generated data must be able to be inserted into the tables in user prompt, constraints like UNIQUE KEY and PARTITIONS must be satisfied
-2. The generated data must be able to be queried by user's queries
-3. The YAML configurations should according to 'usage' below. Do not use rule key in `gendata.yaml` that haven't been documented in 'usage'
+1. The generated data must be able to be queried by user's queries
+2. The generated data must be able to be inserted into the tables in user prompt, constraints like UNIQUE KEY and PARTITIONS must be satisfied
+3. The YAML configurations should conform to 'usage' below. Do not use generation rules that haven't been documented
4. When column stats conflict with queries conditions, prioritize queries conditions and ignore column stats
-5. Output should be a valid YAML and do not output anything else except YAML
+5. No need to generate rules for columns that are not used in queries conditions (like JOIN and WHERE)
+6. Output should be a valid YAML and do not output anything else except YAML
diff --git a/src/prompt/generate.sh b/src/prompt/generate.sh
index a7deb0e..0a8da95 100755
--- a/src/prompt/generate.sh
+++ b/src/prompt/generate.sh
@@ -1,6 +1,6 @@
#!/bin/bash
-set -exuo pipefail
+set -euo pipefail
cp gendata.xml.tpl gendata.xml.new
@@ -22,5 +22,6 @@ sd '^「example」$' "$example" gendata.xml.new
sd '^「tables」$' "$example_tables" gendata.xml.new
sd '^「column-stats」$' "$example_stats" gendata.xml.new
sd '^「format-tags」$' "$format_tags" gendata.xml.new
+sd '\]\([^)]*\)' ']' gendata.xml.new # remove Markdown links; [^)]* (not greedy .*) so multiple links or trailing parens on one line survive
mv gendata.xml gendata.xml.old || true
mv gendata.xml.new gendata.xml
diff --git a/src/replay.go b/src/replay.go
index 4b06578..78a749e 100644
--- a/src/replay.go
+++ b/src/replay.go
@@ -104,7 +104,7 @@ func (c *ReplayClient) conn(ctx context.Context, currdb string, reconnect ...boo
clusterId = fmt.Sprintf("@`%s`", c.cluster)
}
if _, err := c.connect.ExecContext(ctx, fmt.Sprintf("use `%s`%s", currdb, clusterId)); err != nil {
- logrus.Errorf("client %s switching to db %s failed, err: %v\n", c.client, currdb, err)
+ logrus.Errorf("client %s switching to db %s failed, err: %v", c.client, currdb, err)
return nil, err
}
logrus.Traceln("switching to db", currdb)
@@ -151,7 +151,7 @@ func (c *ReplayClient) writeResult(b []byte) (err error) {
resultFilePath := filepath.Join(c.resultDir, fmt.Sprintf("%s%s", c.client, ReplayResultFileExt))
c.resultFile, err = os.OpenFile(resultFilePath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0600)
if err != nil {
- logrus.Errorf("open replay result file %s failed, err: %v\n", resultFilePath, err)
+ logrus.Errorf("open replay result file %s failed, err: %v", resultFilePath, err)
return err
}
}
@@ -215,7 +215,7 @@ func (c *ReplayClient) consumeHash() string {
}
func (c *ReplayClient) replay(ctx context.Context) error {
- logrus.Debugf("replay %d sqls for client %s\n", len(c.sqls), c.client)
+ logrus.Debugf("replay %d sqls for client %s", len(c.sqls), c.client)
var (
prevTs = c.minTs
@@ -248,13 +248,13 @@ func (c *ReplayClient) replay(ctx context.Context) error {
)
r, durationMs, err := c.queryWithReconnect(ctx, s.Db, s.Stmt)
if err != nil {
- logrus.Debugf("client %s executed sql failed at query_id: %s, err: %v\n", c.client, s.QueryId, err)
+ logrus.Debugf("client %s executed sql failed at query_id: %s, err: %v", c.client, s.QueryId, err)
} else {
for r.Next() {
rowCount++
if rowCount < c.maxHashRows {
if err = c.appendHash(r); err != nil {
- logrus.Errorf("scan sql return rows failed, query_id: %s, err: %v\n", s.QueryId, err)
+ logrus.Errorf("scan sql return rows failed, query_id: %s, err: %v", s.QueryId, err)
break
}
}
@@ -290,7 +290,7 @@ func (c *ReplayClient) replay(ctx context.Context) error {
}
}
- logrus.Debugf("client %s replay done\n", c.client)
+ logrus.Debugf("client %s replay done", c.client)
return nil
}
@@ -312,7 +312,7 @@ func ReplaySqls(
parallel = len(clientSqls)
}
- logrus.Infof("Replay with %d client, parallel %d, started at %v, speed %f\n",
+ logrus.Infof("Replay with %d client, parallel %d, started at %v, speed %f",
len(clientSqls),
parallel,
time.UnixMilli(minTs).UTC().Format("2006-01-02 15:04:05"),
diff --git a/src/ssh.go b/src/ssh.go
index ebfb6ac..64e0880 100644
--- a/src/ssh.go
+++ b/src/ssh.go
@@ -74,9 +74,9 @@ func ScpFromRemote(ctx context.Context, privKey, remoteUrl, localPath string) er
}
if logrus.GetLevel() < logrus.DebugLevel {
- logrus.Infof("downloading %s to %s\n", remotePath, localPath)
+ logrus.Infof("downloading %s to %s", remotePath, localPath)
} else {
- logrus.Infof("downloading %s@%s%s to %s\n", user, host, remotePath, localPath)
+ logrus.Infof("downloading %s@%s%s to %s", user, host, remotePath, localPath)
}
// Create a new SCP client
@@ -108,7 +108,7 @@ func ScpFromRemote(ctx context.Context, privKey, remoteUrl, localPath string) er
err = client.CopyFromRemote(ctx, f, remotePath)
if err != nil {
- logrus.Errorf("Error while copying file from host %s, err: %v\n", host, err)
+ logrus.Errorf("Error while copying file from host %s, err: %v", host, err)
return err
}