Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion notebook/Generate_synthetic_web_events.ipynb
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"cells":[{"cell_type":"code","source":["! python --version\n","\n","! pip install azure-eventhub==5.11.5 faker==24.2.0 pyodbc==5.1.0 --upgrade --force --quiet"],"outputs":[],"execution_count":null,"metadata":{"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"b5f2f2be"},{"cell_type":"code","source":["import json\n","from azure.eventhub import EventHubProducerClient, EventData\n","import os\n","import socket\n","import random\n","\n","from random import randrange\n","\n","eventHubNameevents = \"<Event hub name goes here>\"\n","eventHubConnString = \"<Event hub connection string primary key goes here>\" \n","\n","producer_events = EventHubProducerClient.from_connection_string(conn_str=eventHubConnString, eventhub_name=eventHubNameevents)\n","\n","hostname = socket.gethostname()"],"outputs":[],"execution_count":null,"metadata":{"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"4ec2ea88-591f-4277-9190-a314194ccfc9"},{"cell_type":"code","source":["from faker import Faker\n","from enum import Enum\n","import datetime\n","\n","# class syntax\n","class EVENT_TYPE(Enum):\n"," CLICK = 1\n"," IMPRESSION = 2\n","\n","productIds = [707,708,711,712,714,715,716,717,718,722,738,739,742,743,747,748,779,780,781,782,783,784,792,793,794,795,796,797,798,799,800,801,808,809,810,813,822,835,836,838,858,859,860,864,865,867,868,869,870,873,874,875,876,877,880,881,883,884,885,886,889,891,892,893,894,895,896,899,900,904,905,907,908,909,910,913,916,917,918,920,924,925,926,935,936,937,938,939,940,944,945,947,948,949,951,952,953,954,955,956,957,958,959,960,961,962,963,964,965,966,967,968,969,970,971,972,973,974,975,976,977,978,979,980,981,982,983,984,985,986,987,988,989,990,991,992,993,994,996,997,998,999]\n","\n","\n","def generateImpressionEvent(isAnomaly, productId):\n"," faker = Faker()\n","\n"," event = {}\n"," event[\"eventType\"] = EVENT_TYPE.IMPRESSION.name\n"," event[\"eventID\"] = faker.uuid4()\n"," \n"," start_date = 
datetime.date(year=2024, month=1, day=1)\n"," end_date = datetime.datetime.now()\n"," event[\"eventDate\"] = faker.date_time_between_dates(start_date, end_date).isoformat()\n","\n"," if productId:\n"," event[\"productId\"] = str(productId)\n"," else:\n"," event[\"productId\"] = faker.random_element(productIds)\n","\n"," randomizeUnsupported = randrange(100)\n"," userAgent = {}\n"," userAgent[\"platform\"] = faker.random_element([\"Windows\", \"Mac\", \"Linux\", \"iOS\", \"Android\"])\n"," if randomizeUnsupported <2:\n"," userAgent[\"browser\"] = \"Unsupported\"\n"," else:\n"," if userAgent[\"platform\"] == \"Windows\":\n"," userAgent[\"browser\"] = faker.random_element([\"Edge\", \"Chrome\", \"Firefox\", \"Safari\"])\n"," elif userAgent[\"platform\"] == \"Mac\":\n"," userAgent[\"browser\"] = faker.random_element([\"Chrome\", \"Firefox\", \"Safari\"])\n"," elif userAgent[\"platform\"] == \"Linux\":\n"," userAgent[\"browser\"] = faker.random_element([\"Chrome\", \"Firefox\"])\n"," elif userAgent[\"platform\"] == \"iOS\":\n"," userAgent[\"browser\"] = faker.random_element([\"Safari\", \"Chrome\"])\n"," elif userAgent[\"platform\"] == \"Android\":\n"," userAgent[\"browser\"] = faker.random_element([\"Chrome\", \"Firefox\"])\n"," userAgent[\"browserVersion\"] = faker.random_element([\"10.2\", \"13.6\", \"8.6\", \"8.5\", \"11.2\", \"14.6\", \"6.6\", \"4.5\"])\n"," event[\"userAgent\"] = userAgent\n","\n"," event[\"device\"] = faker.random_element([\"mobile\", \"computer\", \"tablet\", \"mobile\", \"computer\"])\n"," event[\"ip_address\"] = faker.ipv4()\n","\n"," # Adding related products\n"," extraPayload = []\n"," for i in range(randrange(1, 10)): # Random number of related products between 1 and 4\n"," relatedproduct = {\n"," \"relatedProductId\": str(faker.random_element([708, 711, 712, 714, 715])),\n"," \"relatedProductName\": faker.word(),\n"," \"relatedProductCategory\": faker.random_element([\"Electronics\", \"Books\", \"Clothing\", \"Home\", \"Toys\"])\n"," 
}\n"," extraPayload.append(relatedproduct)\n"," event[\"extraPayload\"] = extraPayload\n","\n"," # only set the referer for CLICK events\n"," refererPayload = {}\n"," event[\"referer\"] = refererPayload\n"," \n"," if isAnomaly:\n"," event[\"page_loading_seconds\"] = faker.random_number(4)/100\n"," else:\n"," event[\"page_loading_seconds\"] = faker.random_number(4)/1000\n"," return event\n","\n","\n","def generateClickEvent(impressionEvent, isAnomaly):\n"," faker = Faker()\n","\n"," event = {}\n"," event[\"eventType\"] = EVENT_TYPE.CLICK.name\n"," event[\"eventID\"] = impressionEvent[\"eventID\"]\n"," event[\"eventDate\"] = impressionEvent[\"eventDate\"]\n"," event[\"productId\"] = impressionEvent[\"productId\"]\n"," event[\"userAgent\"] = impressionEvent[\"userAgent\"]\n"," event[\"device\"] = impressionEvent[\"device\"]\n"," event[\"ip_address\"] = impressionEvent[\"ip_address\"]\n"," # Adding clickpath\n"," extraPayload = []\n"," for i in range(randrange(1, 10)): # Random number of clicks between 1 and 10\n"," clickpath = {\n"," \"clickType\": faker.random_element([\"button\", \"link\", \"image\", \"text\"]),\n"," \"url\": faker.url(),\n"," \"title\": faker.random_element([\"Brakes\", \"Helmets\", \"Battery\", \"Mirror\", \"Lights\"])\n"," }\n"," extraPayload.append(clickpath)\n"," event[\"extraPayload\"] = extraPayload\n","\n"," # only set the referer for CLICK events, refererPayload differs by campaign type\n"," refererPayload = {} \n"," refererPayload[\"url\"] = faker.uri()\n"," refererPayload[\"campaignType\"] = faker.random_element( [\"organic\", \"bing\", \"google\", \"facebook\", \"instagram\", \"twitter\", \"pinterest\", \"email\", \"affiliate\"])\n"," match refererPayload[\"campaignType\"]:\n"," case \"bing\"| \"google\"| \"facebook\"| \"instagram\":\n"," refererPayload[\"medium\"] = \"cpc\"\n"," refererPayload[\"adId\"] = faker.uuid4()\n"," refererPayload[\"adGroup\"] = faker.uuid4()\n"," refererPayload[\"adTitle\"] = faker.sentence()\n"," case 
\"twitter\":\n"," refererPayload[\"medium\"] = \"cpc\"\n"," refererPayload[\"adId\"] = faker.uuid4()\n"," case \"pinterest\":\n"," refererPayload[\"medium\"] = \"cpc\"\n"," refererPayload[\"adId\"] = faker.uuid4()\n"," case \"email\":\n"," refererPayload[\"medium\"] = \"email\"\n"," refererPayload[\"campaignId\"] = faker.uuid4()\n"," refererPayload[\"emailId\"] = faker.email()\n"," case \"affiliate\":\n"," refererPayload[\"medium\"] = \"affiliate\"\n"," refererPayload[\"affiliateId\"] = faker.uuid4()\n"," event[\"referer\"] = refererPayload\n"," \n"," if isAnomaly:\n"," event[\"page_loading_seconds\"] = faker.random_number(4)/100\n"," else:\n"," event[\"page_loading_seconds\"] = faker.random_number(4)/1000\n"," return event"],"outputs":[],"execution_count":null,"metadata":{"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"3aeadd3c-a317-4013-b50b-62cf4c2c4b6b"},{"cell_type":"code","source":["def sendToEventsHub(jsonEvent, producer):\n"," eventString = json.dumps(jsonEvent)\n"," print(eventString) \n"," event_data_batch = producer.create_batch() \n"," event_data_batch.add(EventData(eventString)) \n"," producer.send_batch(event_data_batch)\n","\n","def generateEvents(isAnomaly = False, productId = None):\n"," try:\n"," while True:\n"," impressionEvent = generateImpressionEvent(isAnomaly, productId) \n"," sendToEventsHub(impressionEvent, producer_events)\n"," if random.randint(1, 100) > 80:\n"," clickEvent = generateClickEvent(impressionEvent, isAnomaly) \n"," sendToEventsHub(clickEvent, producer_events)\n"," except KeyboardInterrupt:\n"," producer_events.close()\n"],"outputs":[],"execution_count":null,"metadata":{"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"1ccddd75-ee90-4998-b2ff-d28bacaa5d78"},{"cell_type":"code","source":["import datetime\n","print(datetime.datetime.now())\n","generateEvents(False, 
None)\n","print(datetime.datetime.now())\n"],"outputs":[],"execution_count":null,"metadata":{"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"d1526001-31e5-47e4-b1e2-6875d8253c07"}],"metadata":{"kernel_info":{"name":"synapse_pyspark"},"kernelspec":{"name":"synapse_pyspark","language":"Python","display_name":"Synapse PySpark"},"language_info":{"name":"python"},"microsoft":{"language":"python","language_group":"synapse_pyspark","ms_spell_check":{"ms_spell_check_language":"en"}},"nteract":{"version":"nteract-front-end@1.0.0"},"orig_nbformat":4,"widgets":{},"spark_compute":{"compute_id":"/trident/default"},"dependencies":{"environment":{"environmentId":"dd0720a1-4df6-40b3-8ca8-b357fc314571","workspaceId":"a5ffff48-2f89-4188-a99b-f88de16ed5d7"}}},"nbformat":4,"nbformat_minor":5}
{"cells":[{"cell_type":"code","source":["! python --version\n","\n","!pip install pyjwt==2.10.1 azure-eventhub==5.13.0 faker==35.0 pyodbc==5.2.0 --upgrade --force --quiet"],"outputs":[],"execution_count":null,"metadata":{"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"b5f2f2be"},{"cell_type":"code","source":["import json\n","from azure.eventhub import EventHubProducerClient, EventData\n","import os\n","import socket\n","import random\n","\n","from random import randrange\n","\n","eventHubNameevents = \"<Event hub name goes here>\"\n","eventHubConnString = \"<Event hub connection string primary key goes here>\" \n","\n","producer_events = EventHubProducerClient.from_connection_string(conn_str=eventHubConnString, eventhub_name=eventHubNameevents)\n","\n","hostname = socket.gethostname()"],"outputs":[],"execution_count":null,"metadata":{"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"4ec2ea88-591f-4277-9190-a314194ccfc9"},{"cell_type":"code","source":["from faker import Faker\n","from enum import Enum\n","import datetime\n","\n","# class syntax\n","class EVENT_TYPE(Enum):\n"," CLICK = 1\n"," IMPRESSION = 2\n","\n","productIds = [707,708,711,712,714,715,716,717,718,722,738,739,742,743,747,748,779,780,781,782,783,784,792,793,794,795,796,797,798,799,800,801,808,809,810,813,822,835,836,838,858,859,860,864,865,867,868,869,870,873,874,875,876,877,880,881,883,884,885,886,889,891,892,893,894,895,896,899,900,904,905,907,908,909,910,913,916,917,918,920,924,925,926,935,936,937,938,939,940,944,945,947,948,949,951,952,953,954,955,956,957,958,959,960,961,962,963,964,965,966,967,968,969,970,971,972,973,974,975,976,977,978,979,980,981,982,983,984,985,986,987,988,989,990,991,992,993,994,996,997,998,999]\n","\n","\n","def generateImpressionEvent(isAnomaly, productId):\n"," faker = Faker()\n","\n"," event = {}\n"," event[\"eventType\"] = EVENT_TYPE.IMPRESSION.name\n"," event[\"eventID\"] = faker.uuid4()\n"," \n"," start_date = 
datetime.date(year=2024, month=1, day=1)\n"," end_date = datetime.datetime.now()\n"," event[\"eventDate\"] = faker.date_time_between_dates(start_date, end_date).isoformat()\n","\n"," if productId:\n"," event[\"productId\"] = str(productId)\n"," else:\n"," event[\"productId\"] = faker.random_element(productIds)\n","\n"," randomizeUnsupported = randrange(100)\n"," userAgent = {}\n"," userAgent[\"platform\"] = faker.random_element([\"Windows\", \"Mac\", \"Linux\", \"iOS\", \"Android\"])\n"," if randomizeUnsupported <2:\n"," userAgent[\"browser\"] = \"Unsupported\"\n"," else:\n"," if userAgent[\"platform\"] == \"Windows\":\n"," userAgent[\"browser\"] = faker.random_element([\"Edge\", \"Chrome\", \"Firefox\", \"Safari\"])\n"," elif userAgent[\"platform\"] == \"Mac\":\n"," userAgent[\"browser\"] = faker.random_element([\"Chrome\", \"Firefox\", \"Safari\"])\n"," elif userAgent[\"platform\"] == \"Linux\":\n"," userAgent[\"browser\"] = faker.random_element([\"Chrome\", \"Firefox\"])\n"," elif userAgent[\"platform\"] == \"iOS\":\n"," userAgent[\"browser\"] = faker.random_element([\"Safari\", \"Chrome\"])\n"," elif userAgent[\"platform\"] == \"Android\":\n"," userAgent[\"browser\"] = faker.random_element([\"Chrome\", \"Firefox\"])\n"," userAgent[\"browserVersion\"] = faker.random_element([\"10.2\", \"13.6\", \"8.6\", \"8.5\", \"11.2\", \"14.6\", \"6.6\", \"4.5\"])\n"," event[\"userAgent\"] = userAgent\n","\n"," event[\"device\"] = faker.random_element([\"mobile\", \"computer\", \"tablet\", \"mobile\", \"computer\"])\n"," event[\"ip_address\"] = faker.ipv4()\n","\n"," # Adding related products\n"," extraPayload = []\n"," for i in range(randrange(1, 10)): # Random number of related products between 1 and 9\n"," relatedproduct = {\n"," \"relatedProductId\": str(faker.random_element([708, 711, 712, 714, 715])),\n"," \"relatedProductName\": faker.word(),\n"," \"relatedProductCategory\": faker.random_element([\"Electronics\", \"Books\", \"Clothing\", \"Home\", \"Toys\"])\n"," 
}\n"," extraPayload.append(relatedproduct)\n"," event[\"extraPayload\"] = extraPayload\n","\n"," # referer details are only set for CLICK events; impressions carry an empty referer\n"," refererPayload = {}\n"," event[\"referer\"] = refererPayload\n"," \n"," if isAnomaly:\n"," event[\"page_loading_seconds\"] = faker.random_number(4)/100\n"," else:\n"," event[\"page_loading_seconds\"] = faker.random_number(4)/1000\n"," return event\n","\n","\n","def generateClickEvent(impressionEvent, isAnomaly):\n"," faker = Faker()\n","\n"," event = {}\n"," event[\"eventType\"] = EVENT_TYPE.CLICK.name\n"," event[\"eventID\"] = impressionEvent[\"eventID\"]\n"," event[\"eventDate\"] = impressionEvent[\"eventDate\"]\n"," event[\"productId\"] = impressionEvent[\"productId\"]\n"," event[\"userAgent\"] = impressionEvent[\"userAgent\"]\n"," event[\"device\"] = impressionEvent[\"device\"]\n"," event[\"ip_address\"] = impressionEvent[\"ip_address\"]\n"," # Adding clickpath\n"," extraPayload = []\n"," for i in range(randrange(1, 10)): # Random number of clicks between 1 and 9\n"," clickpath = {\n"," \"clickType\": faker.random_element([\"button\", \"link\", \"image\", \"text\"]),\n"," \"url\": faker.url(),\n"," \"title\": faker.random_element([\"Brakes\", \"Helmets\", \"Battery\", \"Mirror\", \"Lights\"])\n"," }\n"," extraPayload.append(clickpath)\n"," event[\"extraPayload\"] = extraPayload\n","\n"," # only set the referer for CLICK events, refererPayload differs by campaign type\n"," refererPayload = {} \n"," refererPayload[\"url\"] = faker.uri()\n"," refererPayload[\"campaignType\"] = faker.random_element( [\"organic\", \"bing\", \"google\", \"facebook\", \"instagram\", \"twitter\", \"pinterest\", \"email\", \"affiliate\"])\n"," match refererPayload[\"campaignType\"]:\n"," case \"bing\"| \"google\"| \"facebook\"| \"instagram\":\n"," refererPayload[\"medium\"] = \"cpc\"\n"," refererPayload[\"adId\"] = faker.uuid4()\n"," refererPayload[\"adGroup\"] = faker.uuid4()\n"," refererPayload[\"adTitle\"] = faker.sentence()\n"," case 
\"twitter\":\n"," refererPayload[\"medium\"] = \"cpc\"\n"," refererPayload[\"adId\"] = faker.uuid4()\n"," case \"pinterest\":\n"," refererPayload[\"medium\"] = \"cpc\"\n"," refererPayload[\"adId\"] = faker.uuid4()\n"," case \"email\":\n"," refererPayload[\"medium\"] = \"email\"\n"," refererPayload[\"campaignId\"] = faker.uuid4()\n"," refererPayload[\"emailId\"] = faker.email()\n"," case \"affiliate\":\n"," refererPayload[\"medium\"] = \"affiliate\"\n"," refererPayload[\"affiliateId\"] = faker.uuid4()\n"," event[\"referer\"] = refererPayload\n"," \n"," if isAnomaly:\n"," event[\"page_loading_seconds\"] = faker.random_number(4)/100\n"," else:\n"," event[\"page_loading_seconds\"] = faker.random_number(4)/1000\n"," return event"],"outputs":[],"execution_count":null,"metadata":{"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"3aeadd3c-a317-4013-b50b-62cf4c2c4b6b"},{"cell_type":"code","source":["def sendToEventsHub(jsonEvent, producer):\n"," eventString = json.dumps(jsonEvent)\n"," print(eventString) \n"," event_data_batch = producer.create_batch() \n"," event_data_batch.add(EventData(eventString)) \n"," producer.send_batch(event_data_batch)\n","\n","def generateEvents(isAnomaly = False, productId = None):\n"," try:\n"," while True:\n"," impressionEvent = generateImpressionEvent(isAnomaly, productId) \n"," sendToEventsHub(impressionEvent, producer_events)\n"," if random.randint(1, 100) > 80:\n"," clickEvent = generateClickEvent(impressionEvent, isAnomaly) \n"," sendToEventsHub(clickEvent, producer_events)\n"," except KeyboardInterrupt:\n"," producer_events.close()\n"],"outputs":[],"execution_count":null,"metadata":{"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"1ccddd75-ee90-4998-b2ff-d28bacaa5d78"},{"cell_type":"code","source":["import datetime\n","print(datetime.datetime.now())\n","generateEvents(False, 
None)\n","print(datetime.datetime.now())\n"],"outputs":[],"execution_count":null,"metadata":{"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"d1526001-31e5-47e4-b1e2-6875d8253c07"}],"metadata":{"kernel_info":{"name":"synapse_pyspark"},"kernelspec":{"name":"synapse_pyspark","language":"Python","display_name":"Synapse PySpark"},"language_info":{"name":"python"},"microsoft":{"language":"python","language_group":"synapse_pyspark","ms_spell_check":{"ms_spell_check_language":"en"}},"nteract":{"version":"nteract-front-end@1.0.0"},"orig_nbformat":4,"widgets":{},"spark_compute":{"compute_id":"/trident/default"},"dependencies":{"environment":{"environmentId":"dd0720a1-4df6-40b3-8ca8-b357fc314571","workspaceId":"a5ffff48-2f89-4188-a99b-f88de16ed5d7"}}},"nbformat":4,"nbformat_minor":5}