From 205bf4d91b47c4dfbd00cf3de81c6540e41c64c3 Mon Sep 17 00:00:00 2001 From: Kord Campbell Date: Sun, 5 Feb 2023 15:46:44 -0600 Subject: [PATCH 1/5] weaviate compose --- bot.py | 7 ++----- scripts/docker-compose.yml | 24 ++++++++++++++++++++++++ 2 files changed, 26 insertions(+), 5 deletions(-) create mode 100644 scripts/docker-compose.yml diff --git a/bot.py b/bot.py index 5d446bf..2325a78 100644 --- a/bot.py +++ b/bot.py @@ -201,18 +201,16 @@ async def on_message(message): if document.get('template_file', "eject_document") == "eject_document": if document.get('use_sql') and document.get('table_to_use'): - await message.channel.send("Ejecting document from the query pipeline. Use the :thumbsup: emoji to continue, or reply in a thread below to inject SQL.") + await message.channel.send("Use the :thumbsup: emoji or reply in thread below to execute SQL.") if document.get('chart_type'): await message.channel.send("Would use the *%s* database projected to a %s." % (document.get('table_to_use'), document.get('chart_type'))) else: await message.channel.send("Would use the *%s* database to run a query." % (document.get('table_to_use'))) - else: - await message.channel.send("Ejecting document from the query pipeline.") await message.channel.send(document.get("explain")) - print(document) + # create a history document and send to weaviate history_document = { "author": document.get('author'), "plain": document.get('plain'), @@ -223,7 +221,6 @@ async def on_message(message): "type_of_chart": document.get('type_of_chart') } data_uuid = weaviate_update(history_document, "History") - print(data_uuid) else: await message.channel.send(document.get("explain")) diff --git a/scripts/docker-compose.yml b/scripts/docker-compose.yml new file mode 100644 index 0000000..3cc3667 --- /dev/null +++ b/scripts/docker-compose.yml @@ -0,0 +1,24 @@ +--- +version: '3.4' +services: + weaviate: + command: + - --host + - 0.0.0.0 + - --port + - '8080' + - --scheme + - http + image: semitechnologies/weaviate:1.17.2 + ports: + - 8080:8080 + restart: on-failure:0 + environment: + OPENAI_APIKEY: $OPENAI_APIKEY + QUERY_DEFAULTS_LIMIT: 25 + AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: 'true' + PERSISTENCE_DATA_PATH: '/var/lib/weaviate' + DEFAULT_VECTORIZER_MODULE: 'text2vec-openai' + ENABLE_MODULES: 'text2vec-openai' + CLUSTER_HOSTNAME: 'node1' +... 
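Not part of the patch itself, but a minimal sketch of how the Weaviate service from scripts/docker-compose.yml above can be exercised once it is up, using the same weaviate-client calls the bot relies on. Assumptions: Weaviate is reachable on the default http://localhost:8080, OPENAI_APIKEY is exported so the text2vec-openai module can vectorize, and weaviate_schema.json is the file defining the "History" class used at this point in the series.

```python
# Sketch only (not applied by the patch): sanity-check the compose service.
# Assumes Weaviate on http://localhost:8080 and OPENAI_APIKEY in the env.
import json

import weaviate

client = weaviate.Client("http://localhost:8080")

# bot.py expects a schema to exist before writing; weaviate_schema.json is
# assumed to hold the "History" class definition.
if not client.schema.contains():
    with open("weaviate_schema.json") as f:
        client.schema.create(json.load(f))

# Store a document the same way weaviate_update(history_document, "History")
# does in bot.py; create() returns the object's uuid.
data_uuid = client.data_object.create(
    {
        "author": "kord",
        "plain": "show me a graph of signups",
        "explain": "It looks like you are trying to get me to graph something.",
    },
    "History",
)

# Query it back with near_text, mirroring weaviate_query() in database.py.
result = (
    client.query
    .get("History", ["plain", "author", "explain"])
    .with_additional(["certainty", "distance", "id"])
    .with_near_text({"concepts": ["signups"]})
    .do()
)
print(data_uuid, result)
```

Later patches in this series rename the class to Intent and change its properties, so the class name and field list in a check like this would need to track those schema changes.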
From 77ce0aa4b2e829b83e9067cd3be8c2365119debd Mon Sep 17 00:00:00 2001 From: Kord Campbell Date: Mon, 6 Feb 2023 17:12:46 -0600 Subject: [PATCH 2/5] weaviate in --- ai.py | 28 +++++++++-- bot.py | 88 +++++++++++++++++++++++++++------- database.py | 25 ++++++---- sample_weaviate_queries.py | 16 +++++-- templates/determine_intent.txt | 20 ++++---- weaviate_schema.json | 29 ++++++----- 6 files changed, 148 insertions(+), 58 deletions(-) diff --git a/ai.py b/ai.py index e93952c..0069656 100644 --- a/ai.py +++ b/ai.py @@ -8,6 +8,8 @@ from string import Template +from database import weaviate_query + import config # AI model call by method name @@ -143,7 +145,24 @@ def query(document): # random number for ids document['random'] = int(random.random()*1000000000) + for distance in range(0, 10): + intents = weaviate_query({"concepts": [document.get('plain')]}, "Intent", float(distance/10)) + + if len(intents) > 3: + break + print("==============") + print(intents) + print("==============") + + _intents = [] + for intent in intents: + intent.pop('_additional') + _intents.append(intent) + + + document['intents'] = _intents + # substitute things template = load_template(template_file) prompt = template.substitute(document) @@ -166,7 +185,11 @@ def query(document): print(ex) print(answer) print("===============================") - document['explain'] = "I had problems returning a valid response." + if not document.get('explain', None): + document['explain'] = "I had problems returning a valid response." + + document['error'] = ex + document['is_sql'] = False document['template_file'] = "eject_document" @@ -189,10 +212,9 @@ def feedback(document, template_file="sql_feedback"): template = load_template(template_file) prompt = template.substitute(prompt_data) - print(prompt) answer = gpt3_completion(prompt) - print(answer) + try: answer_dict = eval('{%s' % (answer.strip("\n").strip(" "))) except: diff --git a/bot.py b/bot.py index 2325a78..e61e812 100644 --- a/bot.py +++ b/bot.py @@ -13,7 +13,7 @@ from ai import ai from database import featurebase_tables_schema, featurebase_tables_string, featurebase_query -from database import weaviate_update, weaviate_query +from database import weaviate_update, weaviate_query, weaviate_delete import weaviate @@ -85,12 +85,16 @@ async def on_ready(): async def on_reaction_add(reaction, user): channel = client.get_channel(reaction.message.channel.id) if reaction.message.channel.id == 1067446497253265410: + if "uuid:" in reaction.message.content: + uuid = reaction.message.content.split("uuid: ")[1] + weaviate_delete(uuid, "Intent") + await channel.send("Deleted document %s from Weaviate." 
% uuid) + return + await channel.send("Standby...running query.") document = {"concepts": [reaction.message.content]} - result = weaviate_query(document, "History", 0.2) - - print(result) + result = weaviate_query(document, "Intent", 0.2) # refactor # run until we get SQL, or an explaination/answer @@ -112,13 +116,13 @@ async def on_reaction_add(reaction, user): pretty_table.add_row(entry) table_string = "```\n%s\n```" % pretty_table - await message.channel.send(document.get('explain')) - await message.channel.send(document.get('sql')) - await message.channel.send(table_string) + await channel.send(document.get('explain')) + await channel.send(document.get('sql')) + await channel.send(table_string) else: - await message.channel.send("It goes without saying, we have to be here.") - await message.channel.send(document.get('explain')) + await channel.send("It goes without saying, we have to be here.") + await channel.send(document.get('explain')) return @@ -141,6 +145,39 @@ async def on_message(message): await message.channel.send("Sorry %s, I can't work in here. See the #bot-dev channel!" % message.author.name) return + if message.content.lower().startswith("delete ") and message.author.name == "Kord": + uuid = message.content.split(" ")[1] + weaviate_delete(uuid, "Intent") + await message.channel.send("Deleted document %s from Weaviate." % uuid) + return + + # test message content for json + try: + document = json.loads(message.content) + if document.get('sql', False): + use_sql = True + else: + use_sql = False + + if document.get('type_of_chart', False): + use_chart = True + else: + use_chart = False + + intent_document = { + "author": message.author.name, + "plain": document.get('plain'), + "explain": document.get('explain'), + "sql": document.get('sql'), + "table": document.get('table'), + "display_type": document.get('display_type') + } + data_uuid = weaviate_update(intent_document, "Intent") + await message.channel.send("Document inserted into Weaviate with uuid: %s" % data_uuid) + + except: + pass + # who wants dallE? # creates images from a prompt if message.content.lower().startswith("dream "): @@ -200,29 +237,44 @@ async def on_message(message): if document.get('template_file', "eject_document") == "eject_document": - if document.get('use_sql') and document.get('table_to_use'): + """ + if document.get('sql') and document.get('table_to_use'): await message.channel.send("Use the :thumbsup: emoji or reply in thread below to execute SQL.") if document.get('chart_type'): await message.channel.send("Would use the *%s* database projected to a %s." % (document.get('table_to_use'), document.get('chart_type'))) else: await message.channel.send("Would use the *%s* database to run a query." 
% (document.get('table_to_use'))) - + """ await message.channel.send(document.get("explain")) + if document.get("sql"): + document = featurebase_query(document) + + if document.get('error', False): + await message.channel.send("Got an answer, but no data.") + elif document.get('data', []): + await message.channel.send(document.get('data')) + + # create a history document and send to weaviate - history_document = { + intent = { "author": document.get('author'), "plain": document.get('plain'), "explain": document.get('explain'), - "use_sql": document.get('use_sql'), - "use_chart": document.get('use_chart'), - "table_to_use": document.get('table_to_use'), - "type_of_chart": document.get('type_of_chart') - } - data_uuid = weaviate_update(history_document, "History") + "sql": document.get('sql'), + "table": document.get('table'), + "display_type": document.get('display_type') + } + + data_uuid = weaviate_update(intent, "Intent") + await message.channel.send("Document inserted into Weaviate with uuid: %s" % data_uuid) else: + await message.channel.send("Document was not ejected.") + if document.get('template_file', ""): + await message.channel.send(document.get("template_file")) + await message.channel.send(document.get("explain")) return diff --git a/database.py b/database.py index 138a56b..7619ff2 100644 --- a/database.py +++ b/database.py @@ -120,11 +120,10 @@ def featurebase_query(document): try: query = document.get("sql") result = requests.post( - url, + config.featurebase_url+"/sql", data=query.encode('utf-8'), headers={'Content-Type': 'text/plain'} ).json() - print(result) except Exception as ex: # bad query? exc_type, exc_obj, exc_tb = sys.exc_info() @@ -149,10 +148,7 @@ def featurebase_query(document): document['template_file'] = "process_response" else: - print(result) - document['error'] = "No useful information was returned." - document['is_sql'] = 'False' - document['explain'] = "(╯°□°)╯︵ ┻━┻" + document['explain'] = "Query was successful, but returned no data." 
document['template_file'] = "eject_document" # forces the document flow to stop return document @@ -174,19 +170,22 @@ def weaviate_query(document, collection, distance=0.5): # fetch result and fields result = ( weaviate_client.query - .get(collection, ["plain","author", "explain","chart_type","table_to_use"]) + .get(collection, ["plain", "author", "explain", "table", "sql", "display_type"]) .with_additional(["certainty", "distance", "id"]) .with_near_text(nearText) .do() ) + _records = [] + for record in result.get('data').get('Get').get(collection): - print(record.get('_additional').get('certainty'), "|", record.get('_additional').get('distance'), "|", record.get('table'), "|", record.get('sql')) + _records.append(record) - return + return _records # send a document to a class/collection def weaviate_update(document, collection): + print(document, collection) try: data_uuid = weaviate_client.data_object.create(document, collection) @@ -196,3 +195,11 @@ def weaviate_update(document, collection): return data_uuid +def weaviate_delete(uuid, collection): + try: + weaviate_client.data_object.delete(uuid, collection) + except Exception as ex: + print(ex) + + return + diff --git a/sample_weaviate_queries.py b/sample_weaviate_queries.py index 32e22d7..40937f6 100644 --- a/sample_weaviate_queries.py +++ b/sample_weaviate_queries.py @@ -1,5 +1,6 @@ import weaviate import config +import pprint from database import weaviate_query @@ -11,12 +12,17 @@ ) -all_objects = client.data_object.get(class_name="History") -print(all_objects) +# all_objects = client.data_object.get(class_name="Intent") +# print(all_objects) +for distance in range(0, 10): + intents = weaviate_query({"concepts": "planets"}, "Intent", float(distance/10)) -result = weaviate_query({"concepts": "select all from planets"}, "History", 0.3) -print(result) + if len(intents) > 3: + break + +pp = pprint.PrettyPrinter(indent=4) +pp.pprint(intents) """ for doc in documents: @@ -58,4 +64,4 @@ for record in result.get('data').get('Get').get('Slothbot'): print(record.get('_additional').get('certainty'), "|", record.get('_additional').get('distance'), "|", record.get('table'), "|", record.get('sql')) -""" \ No newline at end of file +""" diff --git a/templates/determine_intent.txt b/templates/determine_intent.txt index 11c997f..5435d56 100644 --- a/templates/determine_intent.txt +++ b/templates/determine_intent.txt @@ -1,20 +1,24 @@ -1. You are a bot that manages a discord channel. You have access to a SQL database backed by Pilosa/FeatureBase. +1. You are a bot that manages a discord channel. You have access to a SQL database backed by FeatureBase. + +2. FeatureBase is an open source, distributed bitmap index that is optimized for speed and scalability. It allows users to quickly analyze and scan massive datasets in real-time using a column-oriented data model, making it ideal for aggregations, rollups, and range scans. In addition, FeatureBase supports the use of SQL for expressing complex data operations, and provides a Discord Bot that can write SQL for you. It also supports multi-dimensional data analysis, providing powerful insights into your data. 2. The current list of tables is: $tables -3. A user named "Kord" has entered a message: +3. A user named "$author" has entered a message: $plain -4. Format rules: Use a boolean True or False to fill in boolean fields. Do not use lowercase letters for these completions! Use double quotes " for strings. +4. Here's an example response. 
Ensure you use all these fields in your completion: +{"author": "$author", plain": "Hello.", "explain": "Hello there.", "sql": ";", "display_type": "text", "table": "None"} 5. Examples from previous interactions: -{"plain": "show me a graph of signups", "use_chart": True, "use_sql": True, "eject_document": False, "chart_type": "barchart", "table_to_use": "website_visits", "explain": "It looks like you are trying to get me to graph something."} -{"plain": "do you like pie", "use_chart": False, "use_sql": False, "eject_document": True, "chart_type": "", "table_to_use": "", "explain": "My favorite is chart pie." } +$intents 6. If you cannot determine a table and chart type, just answer the plain query from the user using the "explain" key. -7. You may offer the user help by telling them to put "help" in front of their question about using the system. +7. If you need an _id number to create a table, you may use: $random. _id is always type id. Other types include set, stringset, string, decimal. + +8. You may offer the user help by telling them to put "help" in front of their question about using the system. -8. Determine user intent in message and put an explaination of your response in "explain". System will determine next steps. Return a python dictionary: -{"plain": "$plain", \ No newline at end of file +9. Determine user intent and Return a python dictionary: +{"user": "$author", plain": "$plain", \ No newline at end of file diff --git a/weaviate_schema.json b/weaviate_schema.json index bb9a951..b5fad12 100644 --- a/weaviate_schema.json +++ b/weaviate_schema.json @@ -1,8 +1,8 @@ { "classes": [ { - "class": "History", - "description": "History index.", + "class": "Intent", + "description": "Intent index.", "vectorIndexConfig": { "vectorCacheMaxObjects": 100000, "ef": 2500 @@ -56,24 +56,23 @@ }, { "dataType": [ - "boolean" - ], - "description": "Whether or not a SQL statement can be written from the plain query.", - "name": "use_sql" - }, - { - "dataType": [ - "boolean" + "string" ], - "description": "Whether or not a SQL statement can be written from the plain query.", - "name": "use_chart" + "description": "Possible SQL query to run.", + "name": "sql", + "moduleConfig": { + "text2vec-openai": { + "skip": false, + "vectorizePropertyName": false + } + } }, { "dataType": [ "string" ], "description": "Table to query.", - "name": "table_to_use", + "name": "table", "moduleConfig": { "text2vec-openai": { "skip": false, @@ -85,8 +84,8 @@ "dataType": [ "string" ], - "description": "Type of chart.", - "name": "chart_type", + "description": "Display type. 
One of ['text', 'bar_chart', 'pie_chart', 'table']", + "name": "display_type", "moduleConfig": { "text2vec-openai": { "skip": false, From a0f24e19dbf8179c742f7e8fe926d82df64cd594 Mon Sep 17 00:00:00 2001 From: Kord Campbell Date: Wed, 8 Feb 2023 13:02:19 -0600 Subject: [PATCH 3/5] pandas to requirements --- ai.py | 12 +++--- bot.py | 71 ++++++++++++++-------------------- database.py | 11 ++++-- requirements.txt | 1 + sample_weaviate_queries.py | 6 +-- templates/determine_intent.txt | 17 ++++---- 6 files changed, 56 insertions(+), 62 deletions(-) diff --git a/ai.py b/ai.py index 0069656..a97102f 100644 --- a/ai.py +++ b/ai.py @@ -148,27 +148,25 @@ def query(document): for distance in range(0, 10): intents = weaviate_query({"concepts": [document.get('plain')]}, "Intent", float(distance/10)) - if len(intents) > 3: + if len(intents) > 5: break - - print("==============") - print(intents) - print("==============") _intents = [] for intent in intents: intent.pop('_additional') _intents.append(intent) - document['intents'] = _intents # substitute things template = load_template(template_file) prompt = template.substitute(document) - + print("===================") + print(prompt) # ask GPT-3 for an answer answer = gpt3_completion(prompt) + print(answer) + print("===================") # try to eval the result try: diff --git a/bot.py b/bot.py index e61e812..92699d7 100644 --- a/bot.py +++ b/bot.py @@ -227,55 +227,42 @@ async def on_message(message): document = { "plain": message.content, "author": message.author.name, - # replace this with a weaviate query - # "history": history_thing + "tables_schema": featurebase_tables_schema(), "tables": featurebase_tables_string(), } # retreive document results from AI document = ai("query", document) - if document.get('template_file', "eject_document") == "eject_document": - - """ - if document.get('sql') and document.get('table_to_use'): - await message.channel.send("Use the :thumbsup: emoji or reply in thread below to execute SQL.") - - if document.get('chart_type'): - await message.channel.send("Would use the *%s* database projected to a %s." % (document.get('table_to_use'), document.get('chart_type'))) - else: - await message.channel.send("Would use the *%s* database to run a query." 
% (document.get('table_to_use'))) - """ - await message.channel.send(document.get("explain")) - - if document.get("sql"): - document = featurebase_query(document) - - if document.get('error', False): - await message.channel.send("Got an answer, but no data.") - elif document.get('data', []): - await message.channel.send(document.get('data')) - - - # create a history document and send to weaviate - intent = { - "author": document.get('author'), - "plain": document.get('plain'), - "explain": document.get('explain'), - "sql": document.get('sql'), - "table": document.get('table'), - "display_type": document.get('display_type') - } - - data_uuid = weaviate_update(intent, "Intent") - await message.channel.send("Document inserted into Weaviate with uuid: %s" % data_uuid) - else: - await message.channel.send("Document was not ejected.") - if document.get('template_file', ""): - await message.channel.send(document.get("template_file")) - - await message.channel.send(document.get("explain")) + await message.channel.send(document.get("explain")) + + print(document) + if document.get("sql") and document.get('table'): + document = featurebase_query(document) + print("return from featurebase") + print(document) + if document.get('error', False): + await message.channel.send("Got an answer, but no data.") + await message.channel.send(document.get("explain")) + await message.channel.send(document.get("error")) + elif document.get('data', []): + await message.channel.send(document.get('data')) + + + # create a history document and send to weaviate + intent = { + "author": document.get('author'), + "plain": document.get('plain'), + "explain": document.get('explain'), + "sql": document.get('sql'), + "table": document.get('table'), + "display_type": document.get('display_type') + } + + data_uuid = weaviate_update(intent, "Intent") + await message.channel.send("Document inserted into Weaviate with uuid: %s" % data_uuid) + return diff --git a/database.py b/database.py index 7619ff2..9b6c51c 100644 --- a/database.py +++ b/database.py @@ -117,6 +117,9 @@ def featurebase_tables_string(table_name=None): # "sql" key in document should have a valid query def featurebase_query(document): # try to run the query + print("USING QUERY") + print(document.get('sql')) + print("USING QUERY") try: query = document.get("sql") result = requests.post( @@ -124,22 +127,24 @@ def featurebase_query(document): data=query.encode('utf-8'), headers={'Content-Type': 'text/plain'} ).json() + print(result.get('data')) except Exception as ex: # bad query? 
exc_type, exc_obj, exc_tb = sys.exc_info() + print("=============") print(exc_type, exc_obj, exc_tb) - + print("=============") document['explain'] = "(╯°□°)╯︵ ┻━┻" document['error'] = "%s: %s" % (exc_tb.tb_lineno, ex) document.pop('template_file', None) - document['is_sql'] = 'False' + return document if result.get('error', ""): # featurebase reports and error document['explain'] = "Error returned by FeatureBase: %s" % result.get('error') document['error'] = result.get('error') - document['is_sql'] = 'False' + document['data'] = result.get('data') document['template_file'] = "handle_error" elif result.get('data', []): diff --git a/requirements.txt b/requirements.txt index 1862918..283df43 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,6 +2,7 @@ discord==1.7.3 openai==0.26.1 prettytable==3.6.0 +pandas==1.5.3 weaviate-client==3.11.0 plotly==5.13.0 randomwords==0.4.0 diff --git a/sample_weaviate_queries.py b/sample_weaviate_queries.py index 40937f6..64fe859 100644 --- a/sample_weaviate_queries.py +++ b/sample_weaviate_queries.py @@ -12,13 +12,13 @@ ) -# all_objects = client.data_object.get(class_name="Intent") -# print(all_objects) +#all_objects = client.data_object.get(class_name="Intent") +#print(all_objects) for distance in range(0, 10): intents = weaviate_query({"concepts": "planets"}, "Intent", float(distance/10)) - if len(intents) > 3: + if len(intents) > 6: break pp = pprint.PrettyPrinter(indent=4) diff --git a/templates/determine_intent.txt b/templates/determine_intent.txt index 5435d56..a194c5e 100644 --- a/templates/determine_intent.txt +++ b/templates/determine_intent.txt @@ -5,20 +5,23 @@ 2. The current list of tables is: $tables -3. A user named "$author" has entered a message: +3. The current list of tables with schemas is: +$tables_schema + +4. A user named "$author" has entered a message: $plain -4. Here's an example response. Ensure you use all these fields in your completion: +5. Here's an example response. Ensure you use all these fields in your completion: {"author": "$author", plain": "Hello.", "explain": "Hello there.", "sql": ";", "display_type": "text", "table": "None"} -5. Examples from previous interactions: +6. Examples from previous interactions: $intents -6. If you cannot determine a table and chart type, just answer the plain query from the user using the "explain" key. +7. If you cannot determine a table and chart type, just answer the plain query from the user using the "explain" key. -7. If you need an _id number to create a table, you may use: $random. _id is always type id. Other types include set, stringset, string, decimal. +8. If you need an _id number to create a table, you may use: $random. _id is always type id. Other types include set, stringset, string, decimal. -8. You may offer the user help by telling them to put "help" in front of their question about using the system. +9. You may offer the user help by telling them to put "help" in front of their question about using the system. -9. Determine user intent and Return a python dictionary: +10. 
Determine user intent and Return a python dictionary: {"user": "$author", plain": "$plain", \ No newline at end of file From 62c40de63d30648ce3db081a53a3c150cd5b5d70 Mon Sep 17 00:00:00 2001 From: Kord Campbell Date: Wed, 8 Feb 2023 13:30:37 -0600 Subject: [PATCH 4/5] instructions --- README.md | 9 +++++++++ ai.py | 5 +---- bot.py | 3 +-- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index f20c8b7..686a12b 100644 --- a/README.md +++ b/README.md @@ -17,9 +17,18 @@ Slothbot uses the following software/libraries: - PrettyTables ## Setup +You'll need some Python packages installed. + +```pip3 install -r requirements.txt``` ### FeatureBase ### Weaviate +To start Weaviate ensure you have Docker. + +Change into the scripts directory: + +```docker compose up``` + ### Discord diff --git a/ai.py b/ai.py index a97102f..542035a 100644 --- a/ai.py +++ b/ai.py @@ -161,12 +161,9 @@ def query(document): # substitute things template = load_template(template_file) prompt = template.substitute(document) - print("===================") - print(prompt) + # ask GPT-3 for an answer answer = gpt3_completion(prompt) - print(answer) - print("===================") # try to eval the result try: diff --git a/bot.py b/bot.py index 92699d7..7fc48c3 100644 --- a/bot.py +++ b/bot.py @@ -240,8 +240,7 @@ async def on_message(message): print(document) if document.get("sql") and document.get('table'): document = featurebase_query(document) - print("return from featurebase") - print(document) + if document.get('error', False): await message.channel.send("Got an answer, but no data.") await message.channel.send(document.get("explain")) From 0d0d3afb820072195b8cbf81f0e057de65713cd1 Mon Sep 17 00:00:00 2001 From: rutherford-jasper Date: Wed, 8 Feb 2023 17:06:18 -0600 Subject: [PATCH 5/5] Hackathon results --- ai.py | 30 +++++++++ bot.py | 41 +++++++----- database.py | 2 +- ingestjsons.py | 7 +++ out.json | 118 ++++++++++++++++++++++++++++++++++ sample_weaviate_queries.py | 12 ++-- templates/support.txt | 7 +++ weaviate_schema.json | 125 +------------------------------------ 8 files changed, 198 insertions(+), 144 deletions(-) create mode 100644 ingestjsons.py create mode 100644 out.json create mode 100644 templates/support.txt diff --git a/ai.py b/ai.py index 542035a..c3f9659 100644 --- a/ai.py +++ b/ai.py @@ -130,6 +130,36 @@ def help(document, template_file="help"): document['explain'] = gpt3_completion(prompt, temperature=0.85, max_tokens=256).strip("\n") return document +@model +def support(document, template_file="support"): + # load openai key then drop it from the document + openai.api_key = document.get('openai_token') + document.pop('openai_token', None) + + # call weaviate with document.plain + for distance in range(0, 10): + intents = weaviate_query({"concepts": document.get('plain')}, "Support", float(distance / 10.0)) + if len(intents) > 6: + break + + _intents = [] + for intent in intents: + intent.pop('_additional') + _intents.append(intent) + + # print(_intents) + # document['intents'] = _intents + + # print(document) + + template = load_template(template_file) + prompt = template.substitute({"author": document.get('author'), "plain": document.get('plain'), "intents": _intents}) + + # print(prompt) + + document['explain'] = gpt3_completion(prompt, temperature=0.85, max_tokens=256).strip("\n") + return document + # uses templates in templates directory # set template using document key "template_file" diff --git a/bot.py b/bot.py index 7fc48c3..0d533db 100644 --- a/bot.py +++ 
b/bot.py @@ -28,7 +28,7 @@ def random_string(size=6, chars=string.ascii_letters + string.digits): # discord intents intents = discord.Intents.default() -# intents.message_content = True +intents.message_content = True client = discord.Client(intents=intents) # discord log @@ -47,22 +47,22 @@ async def on_ready(): # show reboot of bot channel = client.get_channel(1067446497253265410) - # test FeatureBase connection - _tables = featurebase_tables_string() + # # test FeatureBase connection + # _tables = featurebase_tables_string() await channel.send("-") - if not _tables: - await channel.send("Couldn't query FeatureBase for tables.") - else: - await channel.send("%s tables availabe via FeatureBase: %s" % (len(_tables.split(",")), _tables)) + # if not _tables: + # await channel.send("Couldn't query FeatureBase for tables.") + # else: + # await channel.send("%s tables availabe via FeatureBase: %s" % (len(_tables.split(",")), _tables)) # connect to weaviate and ensure schema exists try: weaviate_client = weaviate.Client("http://localhost:8080") # Need to reset Weaviate? - # weaviate_client.schema.delete_all() + weaviate_client.schema.delete_all() # make schemas if none found if not weaviate_client.schema.contains(): @@ -136,6 +136,22 @@ async def on_message(message): if message.author == client.user: return + print(message) + if message.content.lower().startswith("support "): + # create document + document = { + "plain": message.content, + "author": message.author.name + } + + url = ai("support", document) + await message.channel.send(document.get('explain')) + + weaviate_update(document, "Support") + + return + + # stop bot from interactions in most channels # use #offtopic and #bot-dev only # allows op to interact everywhere @@ -165,14 +181,11 @@ async def on_message(message): use_chart = False intent_document = { - "author": message.author.name, + "author": document.get('author'), "plain": document.get('plain'), - "explain": document.get('explain'), - "sql": document.get('sql'), - "table": document.get('table'), - "display_type": document.get('display_type') + "explain": document.get('explain') } - data_uuid = weaviate_update(intent_document, "Intent") + data_uuid = weaviate_update(intent_document, "Support") await message.channel.send("Document inserted into Weaviate with uuid: %s" % data_uuid) except: diff --git a/database.py b/database.py index 9b6c51c..0864181 100644 --- a/database.py +++ b/database.py @@ -175,7 +175,7 @@ def weaviate_query(document, collection, distance=0.5): # fetch result and fields result = ( weaviate_client.query - .get(collection, ["plain", "author", "explain", "table", "sql", "display_type"]) + .get(collection, ["plain", "author", "explain"]) .with_additional(["certainty", "distance", "id"]) .with_near_text(nearText) .do() diff --git a/ingestjsons.py b/ingestjsons.py new file mode 100644 index 0000000..9f5d369 --- /dev/null +++ b/ingestjsons.py @@ -0,0 +1,7 @@ +from database import weaviate_update + +def loadMessages(): + with open("out.json") as outfile: + for line in outfile: + document = json.loads(line) + data_uuid = weaviate_update(document, "BotDev") \ No newline at end of file diff --git a/out.json b/out.json new file mode 100644 index 0000000..72ba997 --- /dev/null +++ b/out.json @@ -0,0 +1,118 @@ +{"plain": "I will explore using SETS soon, but it requires me to restructure my data ingestion pipeline slightly so I'll only do that in a few days", "explain": "`[measure_store]Groupby(Rows(int_value, in=[2230,2231] ))`", "author": "Garrett"} +{"plain": 
"<@996429686559211562> At this point in time, my \"int_value\" column is just an INT column and not a SET", "explain": "Ahh, I see", "author": "Garrett"} +{"plain": "No worries, thanks <@1013872606614671360> As mentioned in <#1009506305486114926> , my team and I are working from UK and South Africa (GMT and GMT + 2) so I appreciate that there's limited overlap between our time zones \ud83d\ude42", "explain": "Give me one moment and I'l get you the equivalent PQL", "author": "Garrett"} +{"plain": "Thanks <@1069586792191381515> - this isnt my area of expertise, but I did reach out to one of our in-house experts. They should be responding shortly.I looked into the GroupBy() PQL docs here https://docs.featurebase.com/pql-guide/read/groupby , but it didn't contain my example \ud83d\ude05What I'm trying to accomplish is the equivalent of\n```SELECT COUNT(*) from measure_store where int_value IN (2230, 2231) GROUP BY int_value;```I think I'm just doing something silly. I've moved on from SQL and looking more into PQL. I have the following query.\n```[measure_store]COUNT(UNION(Row(int_value = 2230), Row(int_value = 2231)))```Absolutely! If you have specific questions or data modeling you would like help with, please ask. We may not be able to get to responses immediately, but the more we can help you use the product the better for all of us \ud83d\ude42\ud83d\ude4f We're almost at that stage where we'd want to try the cloud trial \ud83d\ude42 Just trying to get a feel before diving in. Thanks!Hi <@1069586792191381515> - great question on the latency. We are currently in Ohio and performance for our existing customers has been excellent based on feedback across the country. I am not aware of any metrics for communication between N-Virginia. That said, please do try the cloud trial and let us know if you see anything non-performant. If so, we would love to work directly with you to figure out a path forward which works for your use case.Another question \ud83d\ude05 For your Cloud solution, I see that your current offering is in AWS us-east-1 (Ohio). We're also using AWS, but we're in us-east-1 (N. Virginia). Do you possibly have any latency metrics between these two regions? sub-50ms latency between our web application backend and FB would be ideal \ud83d\ude42Was wondering what exactly is incapsulated between these two values in the UI - query time vs. total roundtrip time.Hi tsu shiuan 3407 FeatureBase does notAs for AVG there aren t any known bugs", "explain": "Hi <@1069586792191381515>, you can do this in SQL with SETCONTAINS() https://docs.featurebase.com/sql-preview/sql-functions#syntax", "author": "Garrett"} +{"plain": "I attempted a workaround by executing the following, but struggled to get it to perform float/decimal division instead of integer division which looks like it rounds to the nearest integer \ud83d\ude05 \n```\nSELECT SUM(int_col) / COUNT(int_col) AS A WHERE ...some queries...; \n```The results I get for my table is `150 (COUNT) 334,500(SUM) 0.223(AVG)`And the results I get are correct for `COUNT` and `SUM` but weirdly incorrect for `AVG`Are there any known bugs with the AVG SQL function in FB? I'm performing the following query on an INT column:\n\n```\nSELECT count(int_col) AS A, SUM(int_col) AS B, AVG(int_col) WHERE ...some queries...; \n```Common usage is in this use case orI'll email you to set it up! Have a great weekend!that would be fun!We agree w you...let's keep pushing! I think this is a great use case and we are here to help however we can! 
If it would be valuable, we can set up a hack time via zoom next week to iterate on your work together??!lakes are used a lot when there's considerable scale but my feeling is that there's a lot of space for optimization and FB triggered my interest \ud83d\ude42create sessions is one, then various audiences (I'll probably enrich the records with more user metadata for that), funnels, campaign performances. That kind of stuff. What I'm looking for is performance and usability. \n\nCDP analytics are interesting to me because usually include a lot of data, the value per record is pretty low (reminds a bit of dealing with logs) and the data is consumed by non technical people (marketeers) so how you can interact with the data is important.", "explain": "As for AVG, there aren't any known bugs, however I'll open a ticket and do some testing based on your findings!so the work-around I would expect to return the rounded INTHi <@1069586792191381515>, FeatureBase does not yet perform float-point math unless using the newest Arrow() functionality is being used: https://docs.featurebase.com/pql-guide/read/arrow", "author": "Garrett"} +{"plain": "with the first schema where user_id is used as the ID, I assume there are 5k records inserted, while in the second case we end up with 7.5M. Is this right?although, the dataset has ~5,000 unique user_ids and ~,7.5M unique event_idsyeah it was a huge increase, from minutes to multiple hours.", "explain": "If so, what other analytics might you be interested in getting from this dataset?<@964740976759414805> \nIs the below a good summary? \n\nData Shape: \nuser_id: 5000 Unique\nEvent_id: 7,500,000 Unique\nTimestamp ISO8601 Time for each event_id\n\n`For a given user_id, find all the events that occur within 30-minutes of one another (ie. show me actions within a session assumed at max 30 minutes apart): \nselect\n *,\n datediff(\n 'ms',\n lag(time) over(\n partition by user_id\n order by\n time\n ),\n time\n ) / 1000 / 60.0 as idle_time_minutes \nfrom\n events;`That is correct, so you have many more events per user", "author": "Garrett"} +{"plain": "btw <@996429686559211562> I ingested the data using this schema: event_id__ID_T,user_id__ID_T,timestamp__timestamp it was much much slower than the previous ingestiontranslating batch of 4381 took: 1.916763679s\nmaking fragments for batch of 4381 took 2.294196ms\nimporting fragments took 11.833101222s\n\nwhat is each of the above steps responsible for? (ok importing fragments is self-explanatory \ud83d\ude04 )awesome, that's what I was thinking when I was going through the documentation and wanted to try the CDP operations using FB. Compression is important for that kind of data so to me it's not just a bonus, it's a great product feature.", "explain": "By the way, what environment/language do you use Python, etc?Rules of thumb, translating batches is typically a CPU heavier task, and imports will be dictated a bit by the disks you are using (without getting into the weeds too much, fsync and the way keys are handled requires reading/writing to disk)Did you see a large increase?As for your ingest time, since events are no longer in arrays coming in, they are being distributed across individual records ie more work, but it shouldn't be *that* much moreHey <@964740976759414805> , 1. Translating batch is taking all the input data and doing things such as create strings, 2. Making fragments you can think of as creating the underlying structures (roaring bitmaps) 3. 
Importing..well that's a bit self explanatory but involves pushing the data to your instance of FeatureBase", "author": "Garrett"} +{"plain": "what's the most common usage of FB so far?thank you so much <@996429686559211562> I'll keep working on it", "explain": "The complexity can get very high with calls(), you can do this type of sorting/filtering with 50-60 arguments in your SQL and refine down to a very small subset or cohort with the same latencyBonus: data is compressed at ingestionCommon usage is in this use-case, or close to it, laymans: FeatureBase does well with very wide and sparse datasets where you need to do aggregations over many entities (user_id, device_id,geolocation_id,etc) at runtime, so count(user_id) where segment=\"Happy\" over 100MM user_ids will be low latency without pre-agg or pre-processingOf course! Anytime, let me know if this works", "author": "Garrett"} +{"plain": "because I'll have to partition over usersand then I can use PQL to work with windows, right?ok I'll try with thatI see", "explain": "I'll think more about this tonight and let you know if there's another method too. There's some newer SQL functionality that allows casting I think could give you flexibility tooEach record will have a user_id, event_id, and timestampThat query would give you window based on the timestamp field ^", "author": "Garrett"} +{"plain": "for example, with the previous schema that I shared, I get the customer journey for free by just extracting a single user_idmy initial thought was that pretty much all the queries that are CDP related return user_ids so having the events aggregated per user and sorted, would help with performance and hopefully also by simplifying the queriesthat's interesting", "explain": "Then `select user_id, event_id where user_id = 1234 AND timestamp < and Timestamp >`then pulling events per user_id is easyHmmm, in that case you could have all be fields and use the `--auto-generate`, `--external-generate` to create a entity id for you", "author": "Garrett"} +{"plain": "and btw, what I'm trying to do is to see how FB could be used as the foundation for a \"customer data platform\" and I'm trying to run some standard operations commonly found in CDPs (like creating sessions)", "explain": "In FeatureBase you won't take performance hits for not having your events aggregated per user for most of common queries we seeNow, in some cases it may be better to have the primary entity (primary key OR id_field) as the user and in some cases the eventIf I change that to id_field=user_id with the same data, then you can `select event_id, timestamp where...`Then you can select those fields as columns in your SQL calls, so `select user_id , timestamp where ...`Now if I set id_field=event_id when I ingest the csv, it will create a field for user_id and a field for timestamp (ID, timestamp set respectively)User_id: 1234, Event_id:5678, Timestamp: 2-21-2046 01:15Example:So primary-key is for when you have a string as the unique ID (and still works if you have an INT/ID in there), whereas id-field grabs that ID in your data column and does no translation (because it can already be interpreted as a position)", "author": "Garrett"} +{"plain": "I feel I'm the worst FB user ever :/", "explain": "NOT in the slightest", "author": "Garrett"} +{"plain": "and having the user_id as a field instead of the key is ok, I was thinking that by doing that I would gain some performance as events are getting pre-aggregated over each user during ingestionok, what's the difference between the 
key and an ID field?", "explain": "Oh good question, so you have the ability to set primary-key OR --id-field", "author": "Garrett"} +{"plain": "can I do it with PQL?", "explain": "You'd add the select user_id to your query, then use an AND, Timestamp > t(0) AND Timestamp< t(0)+30 minutes to create your time windowThis would allow you store the granularity of time down to minute, the downside is your user_id becomes a field instead of the key`So event_id__ID_T, user_id__ID_T, time_stamp__timestamp where primary key = event_id`You could do this more traditionally by making your event_id the key and change user_id to an ID field underneath it and ingest your timestamp as timestamp field (instead of recordtime), then you would use an AND clausehmmm", "author": "Garrett"} +{"plain": "and I don't want to waste your time <@996429686559211562> if there's some documentation I can follow, I'll be happy to spend the time doing itthat would be great!", "explain": "In this statement since you added TQ to event_id, you could do [index]Rows(event_id, from=timestamp, to=timestamp) where timestamp is arbitrary so you'd do timestamp + 30 minutes, problem is the granularity bottoms out at hourlyhttps://docs.featurebase.com/pql-guide/read/rowsNo problem, this ability to do in SQL is what I'm checking, the time quantum portion of FeatureBase is being brought over from our DSL PQL", "author": "Garrett"} +{"plain": "typically you do that by using lag(), see an example:\n\nselect\n *,\n datediff(\n 'ms',\n lag(time) over(\n partition by user_id\n order by\n time\n ),\n time\n ) / 1000 / 60.0 as idle_time_minutes \nfrom\n events;now, I want to perform very basic sessionization, using the standard google analytics approach where a session is defined as a set of events that happened in less than 30 mins from each otherwhich is great!using the above types, by ingesting the data to FB I get a nice index where _id is the user_id and all the events are grouped per usermy dataset is simple, the following three columns: event_id__IDArray_YMDH, user_id__ID_T_YMDH, timestamp__RecordTime", "explain": "Let me see if I can translate to a SQL callIn this schema, you have event_id set up to get a granular timestamp, so you could effectively window using that time quantum", "author": "Garrett"} +{"plain": "let me tell you what I'm trying to do, because there might a way to get it done even without the ergonomics of window functionsoh ok", "explain": "Sounds good! Hit me", "author": "gthrone"} +{"plain": "is there support for sql window functions? Ideally I'd like to use lag() or a function with similar semantics.", "explain": "We don't support any windowing today, but we plan to!", "author": "gthrone"} +{"plain": "oh nice! I'll take a look at this hopefully today. Btw, please feel free to use me as guinea pig for that stuff if it helps. Also, regarding my small project, I was planning to just change the ids from uuids to some integer.", "explain": "<@1025093714420432907> Doing a support round-up for open issues, where you able to get going with <@284512437254750208>'s fix?This is the recommended path, if you can use ints as ids in lieu of uuids it will speed up ingest considerably", "author": "Garrett"} +{"plain": "yes, to be fair, my main issue so far has been with mapping types correctly and the naming convention of the column names that is used for that. 
For example, I have a dataset that has some hashes used for IDs, there are negative ints in there, so couldn't use it as an ID.", "explain": "<@964740976759414805> here's an example using a string for the _id: https://github.com/FeatureBaseDB/featurebase-examples/blob/main/bigdata-set/main.py#L219\n\nThat example isn't done, but if you fire it up it will generate data when you go to the page. Sorry for linking to an incomplete example, but it's been waiting on the release we should do today!Yup makes complete sense. We've been working on simplifying that a lot for SQL, but feel free to post any questions we can help ya with there!", "author": "gthrone"} +{"plain": "some of the parameters can be easily extracted from the stats that parquet has for exampleI think FB would benefit a lot from an ingester that reads from Parquetthanks <@284512437254750208> I'll probably move forward with what you described, generate CSVs and then ingest them.", "explain": "So you're aware, bulk ingest can directly consume CSV files as well like in this example (which calls a URL): https://docs.featurebase.com/sql-preview/sql-bulk-insert-csv or local files, in which case the `INPUT` becomes `FILE`.We've discussed reading from Parquet, so we'll certainly keep you posted there!", "author": "gthrone"} +{"plain": "whatever is considered the best practice for featurebase", "explain": "And, after talking to <@342764551122714624> about this a bit, I would say that we need an example of https://docs.featurebase.com/sql-preview/sql-bulk-insert in the example repo for a few different source types and formats.Ideally, we'd have a Python lib that did something SQLAlchemish for dicts, but it's pretty easy to insert with Python without it.The examples repo has all of these shown, more or less.There's an intermediate step, depending on source. If data is in Kafka, I could run the consumer to insert, or I could read from Kafka from Python and then create inserts with SQL.I have two ways I look at this. First, I could write code that generates CSVs and stores them to disk. At that point, I'd use our CSV consumer.\n\nSecond, I could just write a bit of extra code to spool up the inserts and POST them for indexing. That's what I do here: https://github.com/FeatureBaseDB/featurebase-examples/blob/main/docker-simple/main.py#L55-L63Ok cool! Ditto what Garrett said", "author": "gthrone"} +{"plain": "maybe I should rephrase my question, what's the preferred method for ingesting data into featurebase?the bulk insert works only with SQL right, if someone wants to write an ingester should still use that?", "explain": "<@964740976759414805> When you say write an ingester, are you talking about scripting around an existing endpoint (like the SQL endpoint), or something else?To level set, for programmatic ingest and development the primary/preferred method is using the newest SQL insert and bulk insert statements, in the event the workloads do not/cannot use SQL then the the fallback is using the supported ingesters (CSV,Kafka, etc).", "author": "Garrett"} +{"plain": "ingestion is performed through the Import data HTTP Endpoint (https://docs.featurebase.com/community/community-api/http-api#import-data) with the data serialized in protobuf?time to break more things! 
I'll be back.but I did it, I managed to ingest a few records!!oh well, I was expecting a more dramatic rendering of the gif \ud83d\ude33https://tenor.com/view/rocky-winning-gif-14332927", "explain": "<@964740976759414805> This import method is an older (vestigial) method and has not been contributed too in some time, I would recommend this bulk method if you're looking to ingest over HTTP: https://docs.featurebase.com/sql-preview/sql-bulk-insert", "author": "Garrett"} +{"plain": "<@996429686559211562> Thank you! I'll try this and get back to you", "explain": "Let us know if you have any other questions, happy to help <@964740976759414805>Scroll down to header descriptions for extra detail and configuring other field types", "author": "Garrett"} +{"plain": "but I can't do this with the csv ingester (./idk/molecula-consumer-csv) right?", "explain": "You can use this section in the docs for reference: https://docs.featurebase.com/community/community-data-ingestion/ingester-configuration#csv-ingesterNote, the header is written by column position within your CSVIn this case, you would use your header spec to indicate uuid should be a string, then assign that same name to the primary key field seen above`/molecula-consumer-csv \\\n --index \\\n --primary-key-fields uuid \\\n --files \\\n --header \"uuid__String_F,test__String_F,user_id__ID\"`Ah, no you cannot, one moment", "author": "Garrett"} +{"plain": "is there documentation on this mechanism somewhere? I have a csv file that I want to ingest with the id being a uuid so using this mechanism will help me avoid regenerating the data and converting uuids into positive ints'keyed' tables use a lookup internally to transform the string into the required id value.Featurebase tables need their keys to be a positive int (\"_id id\" in sql datatype parlance). There is a mechanism to use a string (\"_id string\") in which case we call the table 'keyed'time for the silly question of the day. For a field to be used as the ID, the values have to be unsigned integers only? I can't use uuids, strings, negative numbers. Is this correct?", "explain": "In your case, you would ingest your UUID as a string type, then map this string to the _id position<@964740976759414805> We don't have an explicit section on this mechanism (thanks for highlighting this!), we can use the bulk insert https://docs.featurebase.com/sql-preview/sql-bulk-insert-csv to help explain however.", "author": "Garrett"} +{"plain": "if there's something that I can tell is that documentation needs some love, especially for people who are coming from a more data warehouse background and not necessary are aware of bitmaps etc. (I should be keeping some written notes and share with you guys as I go through the experience with the product)Let us know if we can help in any way!We would love that!!!and happy to share feedback and my motivation for playing around with FeatureBase when I have my demo workinghey <@964863148463095850> thanks for asking!! The answer is a fuzzy yes & no. I managed to get things up and running but I have some issues with ingestion. I had to pause my \"project\" for a few days but I'll get back to it soonHNY <@964740976759414805>!! I hope you are having a great January so far!! Curious if you were able to get things up and running?! Let us know how we can help!", "explain": "Hey there \ud83d\udc4b . Anything in particular docs-wise you'd find helpful? Feel free to post any thoughts here or in <#1024356009751355444> . 
We'd love any feedback you have!", "author": "gthrone"} +{"plain": "No problem at all! Have a great New Year!!Thank you <@964863148463095850> - This is the Discord channel linked for help with Featurebase. I'm so sorry to bother you.Hello <@883397073637113936>, I think you are looking for support for the Featurebase app/feedback tool. That is not us, we are a database company. I believe you will find the answer to your question here: https://feedback.featurebase.appI am currently subscribed to the Pro Plan, and need to know how to add another board with a unique link to my account. Can I do this?Hi <@964740976759414805> - Merry Christmas! The (new) sql endpoint for FeatureBase does not talk postgres wire protocol, rather is a http request. The doc at https://docs.featurebase.com/sql-preview/sql-preview discusses how to query using the command line, or direct to the API.Merry Christmas friends! I'm having a bit of a hard time figuring out how to enable the postgres endpoint. I'm trying to run featurebase on MacOS, I downloaded the binaries and I'm running \n\n./featurebase server --sql.endpoint-enabled \n\nthe server runs but I can't connect to localhost:55432 using psql, the --bind parameter is setting the http server binding address and I can't find a parameter for the psql server. What am I missing? \ud83d\ude22", "explain": "<@1060900054577201242> Hi Dan! Is this the quick start guide you found issues with? https://github.com/FeatureBaseDB/featurebase#getting-started", "author": "Garrett"} +{"plain": "No worry, thanks for the update. At least it's working \ud83d\ude42", "explain": "<@1025093714420432907> we have updated the featurebase-examples repo with several changes to implement a CSV consumer in a container. This consumer was tested with both the docker-simple and docker-cluster examples. I also put instructions on how to configure `/etc/hosts` locally to send data to the either example from the host machine.\n\nhttps://github.com/FeatureBaseDB/featurebase-examples/\n\nThese changes require a Docker network be created by the user:\n\n```\ndocker network create fbnet\n```No problem! We've been testing a solution and should have something tomorrow for you!", "author": "Kord"} +{"plain": "At least when the data is loaded I'll be able to the query speed of cluster vs single node. I'm looking to know how to get the cluster running and also figure out what to expect in terms of performance when we go to production.I tried starting a separate container for running the consumer but I got this error: `Error running command: setting up: getting schema: requesting /schema: getting host, previous err: sending request: Get \"http://localhost:10101/schema\": dial tcp 127.0.0.1:10101: connect: connection refused: fetching primary node: requesting /status: Failed all HTTP retries: sending request: Get \"http://localhost:10101/status\": dial tcp 127.0.0.1:10101: connect: connection refused\n`. It's because of it that I moved the consumer to the featurebase node so it would be able to talk to localhost:10101. I don't know enough about how the cluster work to figure out where the localhost is coming. I was starting the consumer with --featurebase-hosts to the featurebase1 node ip.", "explain": "Hope you had a good weekend!We are also working on a new release and will cut the dockers over to it if all goes well...<@1025093714420432907> reporting back here...we got caught up with pulls for a new build and I didn't get to this. 
<@889219187384389702> is taking a look at building a container that pulls a CSV in and runs the consumer on it, which has access to the other containers. The idea here is to get a command you can run on a local CSV that will then be inserted...Yes. I'll get on it here after this release call.Data does not need to be explicit, generic data types per column and its rough distribution will work. Example: \"I have 500MM records and a string column with cardinality around 5000, and I'd like to count the number records that have a specific combination of strings.\"Hmmm, okay thanks for the info on the separate container attempt <@1025093714420432907> ! If you have a small sample of data and common queries you can share, I can give you an idea of how performance will scale and what to expect (mainly I can highlight what components will most influence your performance given all the configurable options).", "author": "Garrett"} +{"plain": "<@284512437254750208> In `start_processes.sh` I changed `--bind $IP_ADDRESS:$BIND_PORT` to `--bind 0.0.0.0:$BIND_PORT` and I'm running `molecula-consumer-csv` from the featurebase1 node and it seem to work now. It's not ideal because the consumer running along side a server nodeno worry <@284512437254750208> thanks for the help. I also tried to run the consumer within the containers but I get another error", "explain": "<@284512437254750208> Would it be worth testing out an overlay network with the consumer in its own container?<@1025093714420432907> I think it should work OK if a separate container is started. I'll post back when I know more.", "author": "Kord"} +{"plain": "thanks <@284512437254750208> ! Your main.py worked perfectly. For a proof of concept I had used the molecula csv loader and it worked great on my loca machine but now using the cluster I get this error: `Error running command: idk.Main.Run: importing batch: starting transaction: Failed all HTTP retries: sending request: Post \"http://192.168.112.4:30101/transaction\": dial tcp 192.168.112.4:30101: i/o timeout` Do you know which timeout value I should adjust?", "explain": "<@1025093714420432907> I've deleted a message here about the issue. I erroneously stated that the cluster would report featurebase:30101, so the info I gave in the comment was not useful. It is true that in the `docker-simple` example that one could map an entry for `featurebase 127.0.0.1` in `/etc/hosts` and make it work, but for the cluster, the IPs are required for setting up the initial cluster URLs, and from my testing the bind entry needs to match those IPs. So, in a single node configuration we can bind to a hostname (featurebase) but in the multi-node deployment, we can't.\n\nWhat I found was that we were previously showing ingestion from inside a container, so what I will do is build an example of this later today and put it up on the repo for you. Sorry for any confusion!Getting confirmation from <@996429686559211562> right now...Ah. <@1025093714420432907> The consumer likely uses GRPC, which isn't exposed in this particular example...I've noticed that it's pretty random which node becomes primary.It may be that the consumer needs to talk to the primary. Look in the UI at / and see what it says for the nodes...", "author": "Kord"} +{"plain": "wow ok that's quick! thanks", "explain": "<@1025093714420432907>, updated the repo's README as well as added primary node detection for insertion, which was causing insertion issues. Let me know how it goes testing it.I am inserting data into it. 
Need to check that things are balancing, so give me till tomorrow to sort that around.I was able to start it but haven't tried much else. There may still be outstanding issues with this, so please let me know what you find.<@1025093714420432907> give this a shot: https://github.com/FeatureBaseDB/featurebase-examples/tree/main/docker-clusterMaybe! \ud83e\udd23 I do have confirmation that a cluster should work, so there's that...", "author": "Kord"} +{"plain": "just csv for now", "explain": "<@1025093714420432907> I've thrown together a compose file and am investigating some issues with updating the names of each container's individual configs...might have something workable tomorrow, maybe.", "author": "Kord"} +{"plain": "yes!", "explain": "Ok. Makes sense. I don't think we have that yet, but it's worth putting together. Let me hunt around and see if we have anything to support creating it. Can I safely assume you don't need Kafka ingestion and will be using SQL3 inserts?", "author": "Kord"} +{"plain": "<@342764551122714624> By any chance do you have readily available docker compose for testing the cluster? Like 3 nodes with featurebase and etcd with the working config?", "explain": "as opposed to a single instance?<@1025093714420432907> are you talking about a docker compose that starts a cluster of containers running featurebase?", "author": "Kord"} +{"plain": "ok will doyeah, for sure", "explain": "Morning <@598633222426853386> ! I'm just checking in to see how things are going and if we can help at all?Alluded to in <#1024356009751355444> , but some major structure changes to docs just went live. This should help further differentiate cloud and community pages, but for continuity, the link <@996429686559211562> posted above is now: https://docs.featurebase.com/cloud/cloud-data-ingestion/streaming-https-endpoint/tutorial-streaming-csv.<@598633222426853386> Thank you for your patience and effort we appreciate it! I'm ticketing the issue you are experiencing with the OSS version so we can provide some answers there.This guide has some good python code to jump start youhttps://docs.featurebase.com/cloud/cloud-data-ingestion/streaming-https-endpoint/tutorial-streaming-csv.", "author": "Garrett"} +{"plain": "Should I use the http endpoint injest?ok cool, yeah the queries on the hosted version were faster on the billion row dataset vs my 400k row. So something seems off", "explain": "Yes, http end pointWith your data model, these queries should be really fast", "author": "Garrett"} +{"plain": "yeah, Kinetica in particular worked way worse on the mac vs a cloud server", "explain": "I'm going to look into the new MBP issues because this looks very resource bound as opposed to FeatureBase (other than your missing value error)Okay, since I can't reproduce this readily, how about we switch back to the cloud environment and I help you get this data loaded (we can work on getting it in faster than the limit)", "author": "Garrett"} +{"plain": "htopsimilar type of issue. So this could be the same thinga couple databases I have tried were totally inoperable on the new MBP", "explain": "Oh really? 
The M2 chips have had issues?", "author": "Garrett"} +{"plain": "I wonder if it is pegging the diskhaha, my brew install htop was frozen until I stopped injest", "explain": "Let me see if we can get you a docker version to limit this from happeningThat file should ingest in about 1 minute in totalIt's definitely consuming all the resourcesHAHA!", "author": "Garrett"} +{"plain": "having some fragments take longer now2022-11-10T21:47:22.210609Z INFO: importing fragments took 1m47.280927542s\n2022-11-10T21:47:22.211284Z INFO: records processed 0-> (300000)\n2022-11-10T21:47:22.352049Z INFO: translating batch of 10000 took: 120.344792ms\n2022-11-10T21:47:22.384082Z INFO: making fragments for batch of 10000 took 32.048833msfeatureba 62969 lerchmo 9237w REG 1,18 24576 5458760 /Users/lerchmo/.pilosa/indexes/lookup/backends/rbf/shard.6639875/data\nfeatureba 62969 lerchmo 9238w REG 1,18 32768 5458762 /Users/lerchmo/.pilosa/indexes/lookup/backends/rbf/shard.6639875/wal\nfeatureba 62969 lerchmo 9239w REG 1,18 24576 5458766 /Users/lerchmo/.pilosa/indexes/lookup/backends/rbf/shard.2775909/data\nfeatureba 62969 lerchmo 9240w REG 1,18 32768 5458771 /Users/lerchmo/.pilosa/indexes/lookup/backends/rbf/shard.16677526/wal\nfeatureba 62969 lerchmo 9241w REG 1,18 24576 5458770 /Users/lerchmo/.pilosa/indexes/lookup/backends/rbf/shard.16677526/dataso that produces a ton of lines like : featureba 62969 lerchmo 8173w REG 1,18 32768 5456343 /Users/lerchmo/.pilosa/indexes/lookup/backends/rbf/shard.334445/wal", "explain": "Please run a `htop`, this is a memory issueOkay, that is normal, you will have file handles open for each fragment and shard", "author": "Garrett"} +{"plain": "ok one sec, just deleting the existing index", "explain": "then while featurebase is running plug that pid into: ` lsof -p `So, while this batch runs let's grab the FeatureBase pid: `ps -ax | grep featurebase`", "author": "Garrett"} +{"plain": "just the binary from featurebase-v1.1.0-community-darwin-arm64.tar.gz", "explain": "I think your OS may be limiting memory to FeatureBase, so let's find outIn the meantime, drop the batch-size down to 10000 so `--batch-size=10000` <@598633222426853386>I'm digging around now", "author": "Garrett"} +{"plain": "Apple M2 Chip 16g ram", "explain": "Hmm that should handle this file, how are you running the featurebase server? Could the file handles be limited in any way?", "author": "Garrett"} +{"plain": "mmap file: mmap 'cannot allocate memory'oh, I solved that.. the issue I am having now.. it took 8 minutes to do the last 100k batch and the server crashed", "explain": "yep", "author": "Garrett"} +{"plain": "uncompressedcsv is 500 megsand I cannot start the server, cannot allocate enough memory... this is for 400k records2022-11-10T21:08:15.038966Z INFO: importing fragments took 8m19.691006958s\n2022-11-10T21:08:15.043867Z INFO: records processed 0-> (400000)So, not sure what the issue is with my local install, but its choking on like 400k rowsis it possible to skip rows with errors? \nERROR Processing 'lookup-table.csv': 'record on line 290964: wrong number of fields'. 
Skipping rest of file.", "explain": "What's the environment, disk and memory wise?As for the ingest issue, 400K rows should be fasthi <@598633222426853386>, the csv-consumer is a strict on data validation, the Cloud and oss Kafka have the ability to skip bad records/values", "author": "Garrett"} +{"plain": "in the admin, it shows all of the fields as \"set\" type./molecula-consumer-csv \\\n--batch-size=100000 \\\n--id-field=hashed_id \\\n--header=hashed_id__ID,event__String_F,key__String_F,val__String_F,type__String_F \\\n--index=lookup \\\n--files=lookup-table.csvok, getting somewhere, this is my command:cool", "explain": "Run a `Select * from hash_id limit 10`This looks correct you have 4 string fields and your ID field won't show up in the tabular view", "author": "Garrett"} +{"plain": "So I am thinking to have an inval and string val, always populate string val, and if the number is numeric put it into the int column as wellultimately I will need to do range queries somehow with my lookup table > <", "explain": "That will work as an interim (and it won't take up that much additional space as ints are Bit Slice Indexed for really high compression)", "author": "Garrett"} +{"plain": "Can I \"CAST(string to int)\" in SQL or PQL ?so basically group the users, and add the int64's into the time bucketed []intsets", "explain": "Not yet, in the next community release you can though! (week or two)", "author": "Garrett"} +{"plain": "now with my user_id table, I want to add to the setsok cool", "explain": "then you'll use String or IDS", "author": "Garrett"} +{"plain": "and if their is a duplicate it skips or updates", "explain": "Updates", "author": "Garrett"} +{"plain": "so the ID is a unique int64now I would like a \"primary key\" for my lookup tableok great, I am working on that right now", "explain": "If you don't want a particular field then use `csvcol__Ignore`in that position", "author": "Garrett"} +{"plain": "oh, my bad. Right there in my downloads folder", "explain": "So structure of the flag is field_name__arg1_arg1config1_arg1config2, arguments are seperated by double underscore, configs are denoted by a single underscore, you'll do this for each column in your csv you want to ingestIt goes by sequential position in the header flag, so: `event_name__String_F_YMDH` refers to the first column in your csv and creates a field in FB named \"event_name\", after the double underscore__we tell it to ingest this as a `string `datatype, for now setting the 3rd argument to F = False for TQ for your first ingest to keep it easyThe above command will pull from the csv file you specifiy, create a table named , and ingest all the csv columns based on the information in the `--header `flagThis page has more info working with this consumer(ingester): https://docs.featurebase.com/reference/data-ingestion/ingester-configuration#csv-ingesterSo I gave you a sample command where we override the headers and use the `--header` flag to give it specific instructionsHaha! 
No worries", "author": "Garrett"} +{"plain": "is molecula-consumer-csv available as a binary?oh, cool", "explain": "The tarball for your version will have all the available consumersYes, you can go here: https://github.com/FeatureBaseDB/featurebase/releases/`./molecula-consumer-csv \\\n--batch-size=100000 \n--id-field=user_id \\\n--ignore-header \\\n--header=event_name__String_F_YMDH,user_id__ID,timestamp__RecordTime_2006-01-02T15:04:05Z \\\n--index=\\\n--files=featurebase-events-import.csv`", "author": "Garrett"} +{"plain": "I can do either", "explain": "I would use the CSV-consumer, it will create the table(index) for you and use the headers for much of it", "author": "Garrett"} +{"plain": "I am testing locally now so I can injest more data, so I am actually trying to figure out how to do the schema with the /index rest endpoint", "explain": "Is your data in a CSV or JSON file?One momentNow that I'm looking at it, it's not very verbose. <@342764551122714624> where is the best place to learn about cloud data types for <@598633222426853386>?", "author": "Garrett"} +{"plain": "and here I see ID, IDSETI see \"tables, databases\" vs indexesor is the software on cloud different then community? https://docs.featurebase.com/data-modeling-guide/cloud/data-typesnothing about IDSET or IDSo I am struggling to find information on the \"ID\" type for my lookup table, I see set, int, timestamp, bool, time, mutex here: https://docs.featurebase.com/reference/api/enterprise/http-api", "explain": "Please refer to this page for information on Cloud Types: https://docs.featurebase.com/data-modeling-guide/cloud/data-typesYou will see Index more widely in the OSS FeatureBase versions than cloud howeverAlso in general \"Index\" = \"Table\" throughout the codebase, Index was the first iteration, but as FeatureBase move towards SQL it is being replaced with Table naming conventionsThe endpoints are currently not the same, the ID type behaves the same but ingesting and setup are a little different", "author": "Garrett"} +{"plain": "or not user_id' but lookup_idand would I need to pull the user_id's to my app and build a new query? or can a join be done?", "explain": "OR if it's ingested as a field, run another query to grab those values (performance considerations here)Oh, you can do a join", "author": "Garrett"} +{"plain": "not SQLto Query the time quantum field, I have to use PQL correct?email, lookup... 
basically a hopefully compressed json index, we will see how it performs", "explain": "Both of these will retrieve all the user_ids that had those values set from t(0) to t(1) using from= and to=Or for string in the events field:\n`[]Row(event=\"Super Fun Day\" ,from='2021-10-01T22:00', to='2021-10-30T01:00')`Your query for one particular field will like the following once TQ is setup: \n`[]Row(event=12345,from='2021-10-01T22:00', to='2021-10-30T01:00')`For now, yes (it's on the SQL list)", "author": "Garrett"} +{"plain": "yeah, I could just make that a string for userID", "explain": "Exactly!", "author": "Garrett"} +{"plain": "ok, will do", "explain": "Now resolving those to emails could be a join or we might could stuff those in your user table as wellWhen you run a query on the table with user_id = id field, the output will be those user_ids (unless you wrap it in a count() or aggregate function)", "author": "Garrett"} +{"plain": "So would I have to pull the ID's out into my app, or can I do a join?might just be able to combine both of thoseand event+param+value = idset+timequantumso I was thinking user_id = id field. events = idset + timequantum", "explain": "Try leaving them separate to start which will give you another addressable column for query flexibility", "author": "Garrett"} +{"plain": "I would also have more data on the user table for reporting in the segmentation builderif someone is just looking for say, people who purchased in the last 7 days, independant of parametersI would probably also have idhash(inthash(event))and the users have. user_id, idset[inthash(Event+Parameter+value]find all of the int64's based on querying this tableSo the 2 tables are inthash(Event + Parameter + value) , event, parameter, string(value)", "explain": "> For your user_id field, I would recommend using just the `id` type rather that `idset` type unless you plan to ingest ids from a JSON array (not typical)You could have a timestamp field where you ingest last purchase timestamp (mutex) or add the time quantum to an event string", "author": "Garrett"} +{"plain": "and what my data should look like for injesting thatplease do, I am trying to find documentation for creating a table with a time quantum idset field", "explain": "<@598633222426853386> This is in a Jupyter Notebook, but this may give you more code examples to think on: https://github.com/FeatureBaseDB/featurebase-examples/blob/main/simple-cloud-webapp/data-generation/jupyternotebook-example.ipynbIs your general workflow 1. Build Audience (Dynamic Segmentation) > 2. Grab all relevant IDs from cohort > 3. Reconcile all IDs to string literal email >4. Send emails out to marketing team/campaign builder?", "author": "Garrett"} +{"plain": "So I am testing the data structure we were discussing now on the cloud. I will probably test it locally just so I Can get data in fasterSure, <@996429686559211562> it is a marketing automation use case, so sometimes I will have say 20 million rows to injest (maybe in 20 or 30 seconds) and on average probably closer to the 1000 per second (but realtime is important) the big injests are for things like email campaign blasts", "explain": "Thanks <@598633222426853386> ! Okay, marketing audience and automation has a very good fit typically, so once you load up some test data and expose us to more of your query patterns we should have more input and data modeling ideas for you. 
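The messages above describe time-quantum range queries of the form `Row(field=value, from=..., to=...)`. Below is a minimal Python sketch of issuing such a call against a local community server, assuming the Pilosa-style `/index/<table>/query` endpoint on port 10101; the table name `events` and the example values are placeholders rather than details confirmed in this thread.

```python
import requests

# Placeholder table name -- substitute your own.
TABLE = "events"

# Time-bounded PQL range call, mirroring the example quoted above.
PQL = """Row(event="Super Fun Day", from='2021-10-01T22:00', to='2021-10-30T01:00')"""

# Assumes the Pilosa-style query endpoint exposed by a local community server.
resp = requests.post(f"http://localhost:10101/index/{TABLE}/query", data=PQL, timeout=30)
resp.raise_for_status()
print(resp.json())  # matching record IDs (e.g. user_ids) for that time window
```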
I just finished a post on utilizing the SET field more effectively with a marketing audience use-case in mind, I'd be happy to send over a copy to see if it resonates/helps.", "author": "Garrett"} +{"plain": "Question, is the only way to load data in the cloud, through the HTTP endpoint at 1000 records per second?So I am trying to wrap my head aroundSo I am trying to wrap my head around data modeling. I am trying to do segmentation on a somewhat unstructured event stream. my idea is to explode the json object into keys\n\n{user:33434, timestamp: timestamp, event: click, properties: {page:x, tests: {test_1:['active_group', 'v4']} }}\n\npage = x\ntests.test_1.active_group\ntests.test_1.v4\n\nand store them in string sets. \n\nThe types of queries I need are: users that visited a page with test.test_1.active_group = true between X dates\n\nthe json I get in properties can be anything so I can't hard code any of the keysThanks <@284512437254750208> I'll give it a try this month, sorry that I can't do it soonerSorry I didn't reply sooner. Thanks for the lengthy explanation. I did a test with `--auto-generate` and I created a numerical key to replace my AUHID and it's blazing fast to insert data (1hr to execute my complete pipeline vs 30hrs with out existing one). Next I tried to run multiple ingester but I realized this wasn't compatible with `--auto-generate` and I didn't have chance to continue yet on my FeatureBase evaluation.", "explain": "Hi <@598633222426853386>, the trial ingest endpoints are set to 1000 records per second, however these are configurable and can scale considerably to handle much higher data volumes for production. Can you tell us a little bit more about your use case and the rates you would expect?Yes, that's the only way today for trials. We can configure things based on customer needs outside of trails!", "author": "gthrone"} +{"plain": "Yup same database as before but I was trying PQL instead and getting that error", "explain": "So Sort() was recently released on a newer version of featurebase, which your database isn't currently running. Upgrading ya or moving ya to a new database would have Sort() available!", "author": "gthrone"} +{"plain": "In the cloud version this query returnsthe `file_size` field is an int type fieldIn the cloud version, this query returns an error of \"Error parsing: no arg validation for 'Sort'\". Any idea what I'm doing wrong?\n\n```\n[karambyte_file_meta]\nSort(All(), field = file_size)\n```", "explain": "Are you still working with the same database that you first created (a couple months ago)? If so, I don't believe Sort() is availableHi <@281560966162808832>, the syntax is correct, however `Sort() `is not an available SQL call in your specific version of FeatureBase Cloud", "author": "Garrett"} +{"plain": "<@996429686559211562> Thanks, I had the batch size of `1048576`, I'll decrease it and give it a try. By curiosity, I'm loading from CSV but can you get lots of performance gain by using Kafka?", "explain": "One caveat is that the IDK download and unzip won't be needed until the new build of v1.2.0-community is up, most likely early next week.<@1025093714420432907> here's an early cut of the guide for Kafka (which uses Python for inserting a record): https://github.com/FeatureBaseDB/featurebase-examples/tree/main/kafka-starter#readme<@1025093714420432907> I'm working on a Kafka guide + POSTing with Python example. 
Can get it to you early if you are interested.Overall, it's helpful to look at your CPU usage across all your available threads once an ingest is running. Each of the levers I mentioned will help throughput by effectively managing utilization of CPU and ideally we have nearly all of them pegged near 100% during ingest for maximum rates. Note, if you're running FeatureBase on the same machine it will be effected, that's why ingesters can be run independently on separate machines/nodes etc.We can also look at adjusting the number of keys translated per batch size by adjusting `--key-translate-batch-size` which determines how many keys are translated at a given time (separate from overall record batch-size), this can be useful if you have really long keys, it's recommend to start at value of 300,000 and increase to 800,000 using this flag<@1025093714420432907> You can get more performance with the Kafka ingester mainly due to parallezation. With the Kafka ingester one option is to partition the topic and use the `--concurrency ` flag with the FeatureBase consumer. Effectively it will automatically spawn the the number of concurrent routines you specify and pull batches (you can lower the batch size a bit and still gain an overall rate increase) Note: it does work without partitioning and will poll the brokers for which group to pull from, but partitioning will set you up for some extra performance gains later with FeatureBase. Another option using CSV ingester is to simply start (2) of the ingesters simultaneously with slightly lower batch sizes. For example, split your 74MM CSV into two files and start each ingester with one of the files, FeatureBase will be able to handle each ingester and correctly manage the keys.", "author": "Garrett"} +{"plain": "I'm attempting to load millions of records using the csv loader. One of the column is a UUID but is not unique among the dataset and twice the server issue this error: `2022-10-22T09:29:00.833358Z INFO:\u00a0 writing CreateFieldKeys response: write tcp 127.0.0.1:10101->127.0.0.1:62586: write: broken pipe` First time at around 74M rows loaded and another at 44M rows loaded. When this happen the loading stop. Is there anything I can do to pass this problem? ThanksThanks it worked! Sorry I missed that part in the documentation.", "explain": "Hello <@1025093714420432907>, for ingestion related questions please do post your full command line input, it helps us understand all the various flags. Typically you will receive your error once you hit a 5 minute timeout on key translation. The first option is to decrease your batch sizes, usually we start with `--batch-size 1000000` for most CSV files. If you are already below this I have another option we can try.", "author": "Garrett"} +{"plain": "Hi everybody we will have some planned downtime for the cloud service today. We are planning a maintenance window from 7 pm to 9 pm CST for the prod environment. This maintenance will only be impacting the UI and any API related functionality (query, ingest) will remain unaffected.", "explain": "Hey <@1025093714420432907>! Just checking to see if this helped or if we need to check on something else for you.", "author": "Garrett"} +{"plain": "If I hit localhost:10101/sql like the example on the website I get a 404. I tried the basic query like \"SELECT 1;\" and it always return 404. The `featurebase cli` also doesn't work for me. 
Is there something special to do to get sql support?hi,For Cloud can data sources be modifiedFor Cloud, can data sources be modified in place? I'd like to add fields to the Column Map for an endpoint", "explain": "More info here on various config flags: https://docs.featurebase.com/data-querying/sql-previewHi <@1025093714420432907> have you enabled SQL using the configuration flag? `sql.endpoint-enabled`", "author": "Garrett"} +{"plain": "The FeatureBase Cloud documentation mentions that the API keys you get from id.featurebase.com are short-lived but can be refreshed, but I don't see any documentation on how you refresh them. \n\nIs there documentation on a refresh endpoint and is there one I can query to check if my current token is still valid or not?Yeah definitely, thank you so much <@964176792682369054>I'm so glad it was helpful!!I do think this was a fantastic resource to have come across.\nHaving some pretty in-depth examples to work with was awesomeNothing in particular, I'm just throwing things at the wall and seeing what sticks \ud83d\ude42<@281560966162808832> was just looking at the SQL you posted above - we have some nascent support in our new SQL implementation for select as a source, but hadn't gone as far as exposing this declaratively as CTEs yet - if you had an opinion on where we should focus SQL efforts next, I'd love to hear it...Sounds great, glad you found that useful", "explain": "\ud83d\udc4b Just saw this but wanted to mention the company is off today. I'll swing back tomorrow to discuss but short answer is there is no documentation for this and id.featurebase.com currently doesn't support using refresh tokens. We want to build that into this endpoint, but there is another endpoint that allows you to use refresh tokens.", "author": "gthrone"} +{"plain": "Yup, heard about the new SQL stuff coming. No worries, I'm just trying to draft queries that are representative of the kinds of answers I'm looking for.\n\nThe _id is a composite \"file_sha256|version\"", "explain": "<@281560966162808832> I just wanted to give you a heads up we have a change coming into the product (tomorrow or early next week) that simplifies creating the ingest endpoint as your github page alluded to. You'll no longer have to specify data types but only the mappings. I know you're past this point, but figured I'd let ya know. We'll be continuing to work on aligning the various data types you pointed out in the coming weeks, so thanks again for the feedback!", "author": "gthrone"} +{"plain": "```\nWITH\n -- Get occurrence statistics for all rule matches.\n t1 AS (\n SELECT \n rule_name, \n COUNT(DISTINCT file_sha256) AS num_files\n FROM\n karambyte_rule_match\n ),\n -- Get all rule matches for a particular record.\n t2 AS (\n SELECT\n DISTINCT rule_matches AS rule_name\n FROM karambyte_file_meta\n WHERE file_sha256 = ''\n )\n\n-- Get occurrence statistics for each rule match present in a particular record.\nSELECT\n rule_name,\n num_files\nFROM\n t1\nINNER JOIN\n t2\nON rule_name\n```\n\nThis is probably the closest to accurate SQL query I can think of right now.\nJust with the imagining that instead of getting a count, I get all of the `file_sha256` values themselves.Actually, in this case, it would more like anything with any of those values.", "explain": "Understood. we have a new SQL engine coming into Cloud (I think we mentioned that before) this month. I'm not sure we could handle that as written, but I'll definitely look into it. Is file_sah256 not your table _id (primary key)? 
If it was it seems like this could be achieved a little simpler with a GROUP BY", "author": "gthrone"} +{"plain": "So then for this, does this necessitate making 2 queries? First to retrieve the set values and the second to make the WHERE ... [AND ...] query?Yup, if I omit DESC or put ASC, records will return.", "explain": "Currently yes, but it sounds like your ideal would be:\n`select fise_sha256 from karambyte_rule_match where function_hashes IN (select function_hashes from karambyte_rule_match where fise_sha256 = 'your record'`\nwith IN operating like ANDs as opposed to anything with any of those values?", "author": "gthrone"} +{"plain": "yeah I'm working in cloud and that seems to be working as I'd expect", "explain": "<@996429686559211562> Thanks! I think it's a similar story for SORT, but the ASC and DESC seems strange. Any ideas there? Eric, if you leave DESC off, do records return?`AS does work but on limited field types currently `", "author": "Garrett"} +{"plain": "I'm using a specific record to identify the set of function_hashes that I want to search for.\n\nThen I want to use that set to find other relevant records where those values are set.Good morning, I think it's the former case", "explain": "Is that based on the query above? Are you still working in cloud? We don't support aliasing with AS currently (to change soon!!) so is that actually running for ya?Note the records may have additional values set as well!You should just be able to search for those values using a WHERE clause. Note you'll need to AND each search (not an IN) to enforce that all the values are there so function_hashes = 'value1' AND function_hashes = 'value2'. That will return all records that both of those values in the SET.", "author": "gthrone"} +{"plain": "Also, when I run a SQL query like:\n\n```\nSELECT \n rule_name, \n COUNT(file_sha256) AS num_files\nFROM karambyte_rule_match\nGROUP BY rule_name\nORDER BY num_files\n```\n\nORDER BY seems to only work if I omit the sort order or set it to `ORDER BY ... ASC`. If I set `ORDER BY DESC`, I get no rows.So I have a stringset column: `function_hashes`.\n\nGiven something like `SELECT function_hashes FROM file_meta WHERE _id = ...`, I can see what `function_hashes` belong to a particular file.\n\nHow would I write a query to get all other files with overlapping sets of `function_hashes`?This seems to have worked by the way. I haven't encountered this ingest issue since.", "explain": "Morning! Do you have a particular set of function_hashes you are looking for? In that case you'd just search where those particular values are set and get all of the relevant records. Or are you trying to get all potential distinct value combinations and their corresponding records?", "author": "gthrone"} +{"plain": "Not at the moment, have a great weekend and thank you!", "explain": "Likewise!", "author": "gthrone"} +{"plain": "I think that's nicely put. If I manually request `file_sha256` I'd like to not return `_id` as they're somewhat redundant to one another for this table.", "explain": "<@281560966162808832> we found a bug that was likely causing the ingest error you saw and have pushed a fix into prod, so you shouldn't encounter it again, but let us know if you do! Any open questions remaining? I wanna make sure we didn't miss anything<@281560966162808832> apologies on the Sort() functionality, but that was recently released into featurebase but not into cloud yet. Cloud periodically does upgrades but isn't always on the latest version! 
We'll be doing another upgrade soon. We are still looking into the ingest and null issues pointed out. I'll let ya know as soon as I do! Appreciate the patience there \ud83d\ude4f", "author": "gthrone"} +{"plain": "I get an error running this query \"Error parsing: no arg validation for 'Sort'\"\n\n```\n[karambyte_file_meta]\nExtract(\n Sort(Row(created_at != null), field=created_at, sort-desc=true), \n Rows(created_at),\n Rows(file_sha256), \n Rows(file_magic))\n```", "explain": "To clarify, do you want to specify returning only a particular field in lieu of the keys for the table (_id)?", "author": "Garrett"} +{"plain": "My _id field is a composite of `file_sha256` and `version`. I've been kind of treating that as a uniqueness constraint but I usually only care about the `file_sha256` field.", "explain": "To just remove it from tabular view you can click the columns icon in your screen shot and click the slide bar on that particular field", "author": "Garrett"} +{"plain": "Also I'm just learning PQL so this might be misunderstanding on my part,\n\n```\n[karambyte_file_meta]\nExtract(\n Row(file_magic == null),\n Rows(file_sha256),\n Rows(file_magic))\n```\n\nThe above PQL query roughly should equate to the SQL below right?:\n\n```\nSELECT _id, file_sha256, file_magic\nFROM karambyte_file_meta\nWHERE file_magic IS NULL\n```\n\nIf so, it seems kind of strange that I still get `null` values back for the `file_magic` field.\n\nAnd is there a way in PQL to not return `_id`?", "explain": "On not returning _id, are you trying to return a different field instead? Or just want to remove that from the return view?This would be the syntax for PQL: `[karambyte_file_meta]Extract(Sort(Row(created_at != null), field=created_at, sort-desc = true), Rows(), Rows( taking a look at this now, thanks for reporting!", "author": "Garrett"} +{"plain": "hasn't shown up yet", "explain": "Ya it's not this long of a delay so maybe not...", "author": "gthrone"} +{"plain": "`'{\"records\": [{\"value\": {\"file_sha256\": \"\", \"function_va\": null, \"version\": \"0.1.1\"}}]}'`\n\nI tried sending this since function_va is another required field and got a \"success_count\": 1 in the response.\n\nAlso tried with \"file_sha256\" set to null and got a successful response.\n\nI don't yet see a response under the errors tab but I'll keep an eye out", "explain": "ya the ingest error endpoint has a little delay, but if you see that error again now, then I'm hoping it points to records potentially being sent without a key???", "author": "gthrone"} +{"plain": "Sounds good, thank you! 
I'm adding more logging to my function and I'll see if I get the bad records to show up again.", "explain": "Out of curiosity, have you tried sending something like '{\"records\": [{\"value\": {\"version\": \"0.1.1\", \"file_sha256\":\"\"}}]}'?", "author": "gthrone"} +{"plain": "I can replicate it so that I see the error client side, but I can't replicate the condition in which the error shows up under the sink errors like it had been yesterday.\n\n```\n#!/usr/bin/env bash\n\nset -eo pipefail\n\n\ngrep PRIMARY_FIELDS tables/function.sh\n# export PRIMARY_FIELDS=\"file_sha256.function_va.version\"\n\n\ncurl \\\n -L \\\n -X POST \\\n -H \"Authorization: Bearer ${FEATUREBASE_TOKEN}\" \\\n -H \"Content-Type: application/json\" \\\n -d '{\"records\": [{\"value\": {\"version\": \"0.1.1\"}}]}' \\\n https://data.featurebase.com/v2/sinks/fd99b9ff-2ce1-4a78-a266-1af79ab6d78b\n```\n\n`{\"code\":\"InvalidParameter\",\"message\":\"One or more records are missing the primary key or ID field 'file_sha256'\"}`", "explain": "Ok let me run this by the team. It might be a little tricky without knowing how to replicate, so let me know if you do see it again and can trace it down to a body of records!", "author": "gthrone"} +{"plain": "I don't think this really has any impact and I don't know that this is the case for certain but:\n\nI'm seeing ingest endpoint errors that look like this in the Data sources tab:\n\n`Error running command: data for primary key should not be nil or missing`\n\nLooking through logs on my worker functions, the only errors I'm seeing seem to be related to rate limits:\n\n`Rate exceeded for shard shardId-000000000000 in stream sink- under account .`\n\nMaybe when the rate limits are applied, data gets cut off? I'm also only logging when I get an \"error_count\" greater than 0.sounds great, thank you", "explain": "Hmmmm the primary key error should occur when either the record sent doesn't have the mapping to the primary key (_id) column or the mapping exists in the record but the value for the key being passed is missing/null. Are you able to replicate the ingest endpoint error?", "author": "gthrone"} +{"plain": "Ahh okay, then that makes sense. 
When I created the table, I set the primary_field_keys (or some option similar to that) to `file_sha256, version` which it looks like is what I'm seeing", "explain": "<@281560966162808832> I'm ticketing these internally and will have some updates for you once we've done some discoveryNailed it, so in this case _id = 1, should have returned the sha256 hash you set from that primary key field, similar to other queries you run", "author": "Garrett"} +{"plain": "Keys is set to true", "explain": "Keys are whether or not you selected a field at ingest to be the record key and translated into those _id positions, in this case the NOT NULL query should have returned those keys and not their _id since you have `Keys = True`", "author": "Garrett"} +{"plain": "What does it mean for the table to be keyed?", "explain": "Go to your Tables Tab > Click the Name of the Table (karambyte_file_meta) > And check the Summary table for Keys = True or False", "author": "Garrett"} +{"plain": "That works, I get a bunch of _id entries back", "explain": "Alright, so I'm getting a ticket in to inspect this a bit deeper on the SQL side, is this table keyed?", "author": "Garrett"} +{"plain": "It does work for an int field `file_size` that I have", "explain": "Try this in PQL: `[karambyte_file_meta]Not(Row(company_name != null))`", "author": "Garrett"} +{"plain": "`Error failed to start SQL query: mapping select: handling is expression is supported only for mutex fields`\n\n```\n \"name\": \"company_name\",\n \"createdAt\": 1664118477963951000,\n \"options\": {\n \"type\": \"mutex\",\n \"cacheType\": \"ranked\",\n \"cacheSize\": 50000,\n \"keys\": true\n```\n\nI did verify that it is a mutex field", "explain": "Gotcha, try it on an `int` field if you have one available", "author": "Garrett"} +{"plain": "yeah, the field `company_name` is a string type. \n\n```\nselect *\nfrom karambyte_file_meta\nwhere company_name != null\n```\n\nI get an error: `Error failed to start SQL query: mapping select: handling: expression must be a value`", "explain": "Try the NOT argument on your query: \n`select *\nfrom karambyte_file_meta\nwhere company_name IS NOT NULL`I may have a solution for you, testing now", "author": "Garrett"} +{"plain": "Yeah it's helped me get off to a good start. There are specific syntactic things I'm trying to drum up also like if there's a good way to query for non-null values. A clause like this doesn't seem to work in SQL land `WHERE != null`", "explain": "Are you running this choosing one of the mutex set fields?", "author": "Garrett"} +{"plain": "hi! i know the team has more resources and will be in touch tomorrow morning to share them, but in the meantime, this is a doc that i've found helpful -- might give you some starting points for sample queries! 
https://drive.google.com/file/d/1VCbVIM7kghFTljTmAufPaYTcIv7gjWuv/view?usp=sharingare there any places I can look up example queries?thanks <@342764551122714624> and <@1012476512978612245>, was really helpful to get a couple more eyes on what I was doing wrong", "explain": "<@281560966162808832> Did this document provide enough examples to find what you were looking for?", "author": "Garrett"} +{"plain": "You can delete the current data source and recreate it.ooh, I didn't even notice that I had doubled those upI believe something like this will work ```{\n \"name\": \"company_name\",\n \"path\": [\n \"file_info.company_name\"\n ],\n \"type\": \"string\",\n \"config\":{\n \"Mutex\": true\n }\n}``` you don't need the second config object.", "explain": "<@281560966162808832> got his data ingested! Mostly related to translation from UI to API, so I'll be raising some tickets to track these.", "author": "gthrone"} +{"plain": "Okay so I have this error popping up\n\n`Error running command: idk field type set is incompatible with featurebase field type mutex: company_name`\n\nthe first image is from the table UI\n\nand this snippet is from the configured sink's schema\n```\n { \n \"name\": \"company_name\", \n \"path\": [ \n \"file_info.company_name\" \n ], \n \"type\": \"string\", \n \"config\": { \n \"config\": { \n \"Mutex\": true \n } \n } \n }, \n```Nvm, I had copied the URL from the UI and it had `data` instead of `api` in the URLAhh actually it looks like I'm getting an error `{ \"code\": \"NotFound\", \"message\": \"Invalid request URI\" }`\n\nThough the endpoint says it is active in the UI: `curl -LH \"Authorization: ${FEATUREBASE_TOKEN}\" https://data.featurebase.com/v2/sinks/cf01d095-a0a4-403f-b004-faabfeb1dc37`", "explain": "Oh and the record you're passing \ud83d\ude4fhmmmm.... do you mind sending me the full schema and possibly a screenshot of the table from the table/columns page in the UI?", "author": "gthrone"} +{"plain": "Still familiarizing myself with the API, I'll ping that endpoint. Thank you", "explain": "Of course! I definitely want to help ya with this and get your feedback, so keep me posted!", "author": "gthrone"} +{"plain": "I'm working with the Cloud trial right now.\nI've gotten to the point of having an ingester and I published some records last night with a seemingly positive response:\n\n`{\"success_count\":1,\"error_count\":0,\"records\":[{\"status\":\"success\"}]}`\n\nBut when I query in the UI, it says there are no rows. Anything I can do to debug this?Wanted to get some opinions on the current idk tests. Right now make startup is failing because it assumes some pilosa containers exist but they are access controlled. I think it would be easy enough to create these somewhere and pull them to fix the issue; however I\u2019m wondering what would be wrong with building local containers based on the built featurebase binary. I know the idk code is a new edition to the featurebase tree so understand the current test, but I suspect it can be made fairly standalone now that it\u2019s all integrated.", "explain": "Hey there <@281560966162808832> . You can see success counts but not have data load for a couple of reasons. Have you tried calling the \"error endpoint\"? Our API doc for this endpoint is here: https://api-docs-featurebase-cloud.redoc.ly/v2#operation/getSinkErrors. 
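The exchange above works through the cloud ingest ("sink") endpoint and its error reporting. For reference, here is a small Python sketch of the same call pattern as the curl command quoted earlier; the sink ID and field values are placeholders, and the `success_count`/`error_count` handling simply mirrors the responses shown in these logs.

```python
import os
import requests

# Placeholders -- substitute your own sink ID; the API token comes from the environment.
SINK_ID = "00000000-0000-0000-0000-000000000000"
TOKEN = os.environ["FEATUREBASE_TOKEN"]

# One record shaped like the examples above (the primary key fields must be present).
payload = {"records": [{"value": {"file_sha256": "abc123", "function_va": 0, "version": "0.1.1"}}]}

resp = requests.post(
    f"https://data.featurebase.com/v2/sinks/{SINK_ID}",
    headers={"Authorization": f"Bearer {TOKEN}"},  # json= below sets Content-Type for us
    json=payload,
    timeout=30,
)
resp.raise_for_status()
body = resp.json()

# Responses in the logs look like {"success_count": 1, "error_count": 0, ...};
# a non-zero error_count is the cue to check the sink's error endpoint.
if body.get("error_count", 0):
    print("some records were rejected:", body)
else:
    print("ingested:", body)
```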
Side note, but this is a big focus for the cloud team right now!", "author": "gthrone"} +{"plain": "great, thanks so much for bringing that to our attention!", "explain": "You bet!", "author": "Kord"} +{"plain": "I will make a note that we should archive the Pilosa documentation", "explain": "I see the docs were in a Molecula repo. Thank you for that!", "author": "Kord"} +{"plain": "we've fixed it so that those \"Help us improve\" links should now be working", "explain": "Cool! Thank you!", "author": "Kord"} +{"plain": "maybe we can tackle both at the same timewhile we're working on getting that link fixed, was there an edit that you wanted to suggest for that page?", "explain": "I also looked around to see if the docs were in a repository, but don't see them. I see the Pilosa Docs...Just to clarify, it's on all the pages.", "author": "Kord"} +{"plain": "are you seeing the 404 on every page?thank you for the heads up, we will look into it!", "explain": "This is the link: https://github.com/molecula/documentation/edit/main/introduction/_posts/2022-04-01-getting-started.mdOh, no. Just the link that goes to the 404.", "author": "Kord"} +{"plain": "<@284512437254750208> this might help https://wilsonmar.github.io/maximum-limits/", "explain": "Apologies if it's a known issue, but the \"Help us improve this article!\" links in the docs (bottom of page) is 404'ing.<@284512437254750208> did I see you check out the cloud trial a couple weeks back by chance??Thanks for that!", "author": "Kord"} +{"plain": "And you can \"Open in full view\" so it's not just stuck in the side bar.Hey, pinged you both in the Thread.\nI'm going to need to go back over this to pull out all the relevant information, will make my life a whole lot easier if it's all in there \ud83d\ude42The default limits for all users (or per user) are usually set in /etc/sysctl.conf in linuxThe limits are set per process, and generally they are inherited. So when you try to start featurebase from a shell under your username, whatever your limits are are inherited by the featurebase processYou probably also need to have a root shell to set itThere's a syscall processes can make to set it", "explain": "I can't even install Docker.I probably should upgrade.", "author": "Kord"} +{"plain": "You might try setting it to 65536My _guess_ is that it won't accept 200000 as a valid # of open filesDrat, I don't have a mojave mac to try to repro on", "explain": "How does the binary attempt to change it?Actually, I think it uses another way to set it.", "author": "Kord"} +{"plain": "iirc, `ulimit -a` will show the hard and soft limits on macosWhat version of macOS?", "explain": "bash-3.2$ ulimit -n 65536 200000\nbash: ulimit: open files: cannot modify limit: Invalid argumentYes, investigating version issues. This is Mojave 10.14.6", "author": "Kord"} +{"plain": "OK interesting. 
Are you on mac or linux?", "explain": "On Mac.", "author": "Kord"} +{"plain": "Can you try something like: `./featurebase server -d /path/to/data/dir --max-file-count=20000 --max-query-memory=200000000` ?Let me see if I can repro locally, sec", "explain": "Looks like ulimit takes soft and hard limits.bash-3.2$ featurebase server -d /etc/featurebase.conf --max-file-count=20000\n2022-09-09T17:25:44.076524Z INFO: Molecula Pilosa v3.20.0 (Aug 25 2022 7:08PM, 6562b60) go1.19\n2022-09-09T17:25:44.085596Z INFO: rbf config = &cfg.Config{MaxSize:4294967296, MaxWALSize:4294967296, MinWALCheckpointSize:1048576, MaxWALCheckpointSize:2147483648, FsyncEnabled:true, FsyncWALEnabled:true, DoAllocZero:false, CursorCacheSize:0, Logger:logger.Logger(nil), MaxDelete:65536}\n2022-09-09T17:25:44.085650Z INFO: cwd: /Users/kord/code/featurebase/featurebase-v3.20.0-darwin-amd64\n2022-09-09T17:25:44.085656Z INFO: cmd line: featurebase server -d /etc/featurebase.conf --max-file-count=20000\n2022-09-09T17:25:44.184354Z INFO: enabled Web UI at :10101\nError: running server: setting resource limits: setting open file limit: invalid argumentYes.", "author": "Kord"} +{"plain": "ah, I see it in the GH issue nowSeeing the config would help", "explain": "Here's the top of a config I tried to set the ulimit:\nname = \"featurebase\"\nbind = \"0.0.0.0:10101\"\nbind-grpc = \"0.0.0.0:20101\"\nmax-query-memory = 200000000\nmax-file-count = 20000It's the config from the install guide and...yup, on the ticket. \ud83d\ude42", "author": "Kord"} +{"plain": "sometimes if the number very large it requires root permissionwhat are you trying to set `max-file-count` to?Going through the docs to see if I can<@284512437254750208> I'm new to the FeatureBase codebase myself, but I can open a GitHub issue for you. Or if you'd like, with the config and such, and I'll get some eyes on it for you.", "explain": "Thanks for that!Doesn't affect the output no matter the values. I will say that I didn't copy it into bin. Just running it as me in a folder in my directory.The default config doesn't have that defined, but I put in this last run:\nmax-query-memory = 200000000\nmax-file-count = 2000", "author": "Kord"} diff --git a/sample_weaviate_queries.py b/sample_weaviate_queries.py index 64fe859..0cb9d0e 100644 --- a/sample_weaviate_queries.py +++ b/sample_weaviate_queries.py @@ -12,14 +12,14 @@ ) -#all_objects = client.data_object.get(class_name="Intent") -#print(all_objects) +all_objects = client.data_object.get(class_name="Support") +print(all_objects) -for distance in range(0, 10): - intents = weaviate_query({"concepts": "planets"}, "Intent", float(distance/10)) +# for distance in range(0, 10): +intents = weaviate_query({"concepts": "hello"}, "Support", float(0.8)) - if len(intents) > 6: - break + # if len(intents) > 6: +# break pp = pprint.PrettyPrinter(indent=4) pp.pprint(intents) diff --git a/templates/support.txt b/templates/support.txt new file mode 100644 index 0000000..ac8403e --- /dev/null +++ b/templates/support.txt @@ -0,0 +1,7 @@ +1. A user named $author has entered the following help prompt: +$plain + +2. Here are some messages that may be related to what the user is asking about: +$intents + +3. 
Your suggestion to $author: \ No newline at end of file diff --git a/weaviate_schema.json b/weaviate_schema.json index b5fad12..4de87d2 100644 --- a/weaviate_schema.json +++ b/weaviate_schema.json @@ -1,7 +1,7 @@ { "classes": [ { - "class": "Intent", + "class": "Support", "description": "Intent index.", "vectorIndexConfig": { "vectorCacheMaxObjects": 100000, @@ -45,7 +45,7 @@ "dataType": [ "string" ], - "description": "GPT-3 explaination of query.", + "description": "bot response", "name": "explain", "moduleConfig": { "text2vec-openai": { @@ -53,127 +53,6 @@ "vectorizePropertyName": false } } - }, - { - "dataType": [ - "string" - ], - "description": "Possible SQL query to run.", - "name": "sql", - "moduleConfig": { - "text2vec-openai": { - "skip": false, - "vectorizePropertyName": false - } - } - }, - { - "dataType": [ - "string" - ], - "description": "Table to query.", - "name": "table", - "moduleConfig": { - "text2vec-openai": { - "skip": false, - "vectorizePropertyName": false - } - } - }, - { - "dataType": [ - "string" - ], - "description": "Display type. One of ['text', 'bar_chart', 'pie_chart', 'table']", - "name": "display_type", - "moduleConfig": { - "text2vec-openai": { - "skip": false, - "vectorizePropertyName": false - } - } - } - ] - }, - { - "class": "SQL", - "description": "SQL index.", - "vectorIndexConfig": { - "vectorCacheMaxObjects": 100000, - "ef": 2500 - }, - "moduleConfig": { - "text2vec-openai": { - "model": "ada", - "modelVersion": "002", - "type": "text" - } - }, - "properties": [ - { - "dataType": [ - "string" - ], - "description": "User's name.", - "name": "author", - "moduleConfig": { - "text2vec-openai": { - "skip": false, - "vectorizePropertyName": false - } - } - }, - { - "dataType": [ - "string" - ], - "description": "User's plain request.", - "name": "plain", - "moduleConfig": { - "text2vec-openai": { - "skip": false, - "vectorizePropertyName": false - } - } - }, - { - "dataType": [ - "string" - ], - "description": "GPT-3 explaination of query.", - "name": "explain", - "moduleConfig": { - "text2vec-openai": { - "skip": false, - "vectorizePropertyName": false - } - } - }, - { - "dataType": [ - "string" - ], - "description": "SQL query, if any.", - "name": "sql", - "moduleConfig": { - "text2vec-openai": { - "skip": false, - "vectorizePropertyName": false - } - } - }, - { - "dataType": [ - "string" - ], - "description": "Table to query.", - "name": "table", - "moduleConfig": { - "text2vec-openai": { - "skip": false, - "vectorizePropertyName": false - } - } } ] }
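Since this patch renames the `Intent` class to `Support` (properties `author`, `plain`, `explain`) and points `sample_weaviate_queries.py` at it, here is a short sketch of querying the class with the Weaviate Python client directly rather than through the repo's `weaviate_query` helper. The localhost URL is an assumption for a local, anonymous-access instance, the concept string is arbitrary, and the 0.8 distance cutoff mirrors the value used in `sample_weaviate_queries.py`.

```python
import weaviate

# Assumes a local Weaviate instance with anonymous access enabled.
client = weaviate.Client("http://localhost:8080")

# Fetch the Support documents closest to a help request, within a distance cutoff.
result = (
    client.query.get("Support", ["plain", "explain", "author"])
    .with_near_text({"concepts": ["how do I enable the sql endpoint?"], "distance": 0.8})
    .with_additional(["distance"])
    .with_limit(5)
    .do()
)

for doc in result["data"]["Get"]["Support"]:
    print(doc["author"], "->", doc["explain"][:80])
```

The retrieved `plain`/`explain` pairs are, presumably, what the new `templates/support.txt` prompt interpolates as `$intents` before asking the model for a suggestion to `$author`.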