From 3a4013503341ae66bdeb469498825a679298000b Mon Sep 17 00:00:00 2001 From: Miho Yamamoto Date: Fri, 12 Sep 2025 11:11:19 +0900 Subject: [PATCH] Update package versions in Generate_synthetic_web_events.ipynb correct !pip command --- notebook/Generate_synthetic_web_events.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notebook/Generate_synthetic_web_events.ipynb b/notebook/Generate_synthetic_web_events.ipynb index 1b9096a..92b21fd 100644 --- a/notebook/Generate_synthetic_web_events.ipynb +++ b/notebook/Generate_synthetic_web_events.ipynb @@ -1 +1 @@ -{"cells":[{"cell_type":"code","source":["! python --version\n","\n","! pip install azure-eventhub==5.11.5 faker==24.2.0 pyodbc==5.1.0 --upgrade --force --quiet"],"outputs":[],"execution_count":null,"metadata":{"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"b5f2f2be"},{"cell_type":"code","source":["import json\n","from azure.eventhub import EventHubProducerClient, EventData\n","import os\n","import socket\n","import random\n","\n","from random import randrange\n","\n","eventHubNameevents = \"\"\n","eventHubConnString = \"\" \n","\n","producer_events = EventHubProducerClient.from_connection_string(conn_str=eventHubConnString, eventhub_name=eventHubNameevents)\n","\n","hostname = socket.gethostname()"],"outputs":[],"execution_count":null,"metadata":{"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"4ec2ea88-591f-4277-9190-a314194ccfc9"},{"cell_type":"code","source":["from faker import Faker\n","from enum import Enum\n","import datetime\n","\n","# class syntax\n","class EVENT_TYPE(Enum):\n"," CLICK = 1\n"," IMPRESSION = 2\n","\n","productIds = 
[707,708,711,712,714,715,716,717,718,722,738,739,742,743,747,748,779,780,781,782,783,784,792,793,794,795,796,797,798,799,800,801,808,809,810,813,822,835,836,838,858,859,860,864,865,867,868,869,870,873,874,875,876,877,880,881,883,884,885,886,889,891,892,893,894,895,896,899,900,904,905,907,908,909,910,913,916,917,918,920,924,925,926,935,936,937,938,939,940,944,945,947,948,949,951,952,953,954,955,956,957,958,959,960,961,962,963,964,965,966,967,968,969,970,971,972,973,974,975,976,977,978,979,980,981,982,983,984,985,986,987,988,989,990,991,992,993,994,996,997,998,999]\n","\n","\n","def generateImpressionEvent(isAnomaly, productId):\n"," faker = Faker()\n","\n"," event = {}\n"," event[\"eventType\"] = EVENT_TYPE.IMPRESSION.name\n"," event[\"eventID\"] = faker.uuid4()\n"," \n"," start_date = datetime.date(year=2024, month=1, day=1)\n"," end_date = datetime.datetime.now()\n"," event[\"eventDate\"] = faker.date_time_between_dates(start_date, end_date).isoformat()\n","\n"," if productId:\n"," event[\"productId\"] = str(productId)\n"," else:\n"," event[\"productId\"] = faker.random_element(productIds)\n","\n"," randomizeUnsupported = randrange(100)\n"," userAgent = {}\n"," userAgent[\"platform\"] = faker.random_element([\"Windows\", \"Mac\", \"Linux\", \"iOS\", \"Android\"])\n"," if randomizeUnsupported <2:\n"," userAgent[\"browser\"] = \"Unsupported\"\n"," else:\n"," if userAgent[\"platform\"] == \"Windows\":\n"," userAgent[\"browser\"] = faker.random_element([\"Edge\", \"Chrome\", \"Firefox\", \"Safari\"])\n"," elif userAgent[\"platform\"] == \"Mac\":\n"," userAgent[\"browser\"] = faker.random_element([\"Chrome\", \"Firefox\", \"Safari\"])\n"," elif userAgent[\"platform\"] == \"Linux\":\n"," userAgent[\"browser\"] = faker.random_element([\"Chrome\", \"Firefox\"])\n"," elif userAgent[\"platform\"] == \"iOS\":\n"," userAgent[\"browser\"] = faker.random_element([\"Safari\", \"Chrome\"])\n"," elif userAgent[\"platform\"] == \"Android\":\n"," userAgent[\"browser\"] = 
faker.random_element([\"Chrome\", \"Firefox\"])\n"," userAgent[\"browserVersion\"] = faker.random_element([\"10.2\", \"13.6\", \"8.6\", \"8.5\", \"11.2\", \"14.6\", \"6.6\", \"4.5\"])\n"," event[\"userAgent\"] = userAgent\n","\n"," event[\"device\"] = faker.random_element([\"mobile\", \"computer\", \"tablet\", \"mobile\", \"computer\"])\n"," event[\"ip_address\"] = faker.ipv4()\n","\n"," # Adding related products\n"," extraPayload = []\n"," for i in range(randrange(1, 10)): # Random number of related products between 1 and 4\n"," relatedproduct = {\n"," \"relatedProductId\": str(faker.random_element([708, 711, 712, 714, 715])),\n"," \"relatedProductName\": faker.word(),\n"," \"relatedProductCategory\": faker.random_element([\"Electronics\", \"Books\", \"Clothing\", \"Home\", \"Toys\"])\n"," }\n"," extraPayload.append(relatedproduct)\n"," event[\"extraPayload\"] = extraPayload\n","\n"," # only set the referer for CLICK events\n"," refererPayload = {}\n"," event[\"referer\"] = refererPayload\n"," \n"," if isAnomaly:\n"," event[\"page_loading_seconds\"] = faker.random_number(4)/100\n"," else:\n"," event[\"page_loading_seconds\"] = faker.random_number(4)/1000\n"," return event\n","\n","\n","def generateClickEvent(impressionEvent, isAnomaly):\n"," faker = Faker()\n","\n"," event = {}\n"," event[\"eventType\"] = EVENT_TYPE.CLICK.name\n"," event[\"eventID\"] = impressionEvent[\"eventID\"]\n"," event[\"eventDate\"] = impressionEvent[\"eventDate\"]\n"," event[\"productId\"] = impressionEvent[\"productId\"]\n"," event[\"userAgent\"] = impressionEvent[\"userAgent\"]\n"," event[\"device\"] = impressionEvent[\"device\"]\n"," event[\"ip_address\"] = impressionEvent[\"ip_address\"]\n"," # Adding clickpath\n"," extraPayload = []\n"," for i in range(randrange(1, 10)): # Random number of clicks between 1 and 10\n"," clickpath = {\n"," \"clickType\": faker.random_element([\"button\", \"link\", \"image\", \"text\"]),\n"," \"url\": faker.url(),\n"," \"title\": 
faker.random_element([\"Brakes\", \"Helmets\", \"Battery\", \"Mirror\", \"Lights\"])\n"," }\n"," extraPayload.append(clickpath)\n"," event[\"extraPayload\"] = extraPayload\n","\n"," # only set the referer for CLICK events, refererPayload differs by campaign type\n"," refererPayload = {} \n"," refererPayload[\"url\"] = faker.uri()\n"," refererPayload[\"campaignType\"] = faker.random_element( [\"organic\", \"bing\", \"google\", \"facebook\", \"instagram\", \"twitter\", \"pinterest\", \"email\", \"affiliate\"])\n"," match refererPayload[\"campaignType\"]:\n"," case \"bing\"| \"google\"| \"facebook\"| \"instagram\":\n"," refererPayload[\"medium\"] = \"cpc\"\n"," refererPayload[\"adId\"] = faker.uuid4()\n"," refererPayload[\"adGroup\"] = faker.uuid4()\n"," refererPayload[\"adTitle\"] = faker.sentence()\n"," case \"twitter\":\n"," refererPayload[\"medium\"] = \"cpc\"\n"," refererPayload[\"adId\"] = faker.uuid4()\n"," case \"pinterest\":\n"," refererPayload[\"medium\"] = \"cpc\"\n"," refererPayload[\"adId\"] = faker.uuid4()\n"," case \"email\":\n"," refererPayload[\"medium\"] = \"email\"\n"," refererPayload[\"campaignId\"] = faker.uuid4()\n"," refererPayload[\"emailId\"] = faker.email()\n"," case \"affiliate\":\n"," refererPayload[\"medium\"] = \"affiliate\"\n"," refererPayload[\"affiliateId\"] = faker.uuid4()\n"," event[\"referer\"] = refererPayload\n"," \n"," if isAnomaly:\n"," event[\"page_loading_seconds\"] = faker.random_number(4)/100\n"," else:\n"," event[\"page_loading_seconds\"] = faker.random_number(4)/1000\n"," return event"],"outputs":[],"execution_count":null,"metadata":{"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"3aeadd3c-a317-4013-b50b-62cf4c2c4b6b"},{"cell_type":"code","source":["def sendToEventsHub(jsonEvent, producer):\n"," eventString = json.dumps(jsonEvent)\n"," print(eventString) \n"," event_data_batch = producer.create_batch() \n"," event_data_batch.add(EventData(eventString)) \n"," 
producer.send_batch(event_data_batch)\n","\n","def generateEvents(isAnomaly = False, productId = None):\n"," try:\n"," while True:\n"," impressionEvent = generateImpressionEvent(isAnomaly, productId) \n"," sendToEventsHub(impressionEvent, producer_events)\n"," if random.randint(1, 100) > 80:\n"," clickEvent = generateClickEvent(impressionEvent, isAnomaly) \n"," sendToEventsHub(clickEvent, producer_events)\n"," except KeyboardInterrupt:\n"," producer_events.close()\n"],"outputs":[],"execution_count":null,"metadata":{"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"1ccddd75-ee90-4998-b2ff-d28bacaa5d78"},{"cell_type":"code","source":["import datetime\n","print(datetime.datetime.now())\n","generateEvents(False, None)\n","print(datetime.datetime.now())\n"],"outputs":[],"execution_count":null,"metadata":{"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"d1526001-31e5-47e4-b1e2-6875d8253c07"}],"metadata":{"kernel_info":{"name":"synapse_pyspark"},"kernelspec":{"name":"synapse_pyspark","language":"Python","display_name":"Synapse PySpark"},"language_info":{"name":"python"},"microsoft":{"language":"python","language_group":"synapse_pyspark","ms_spell_check":{"ms_spell_check_language":"en"}},"nteract":{"version":"nteract-front-end@1.0.0"},"orig_nbformat":4,"widgets":{},"spark_compute":{"compute_id":"/trident/default"},"dependencies":{"environment":{"environmentId":"dd0720a1-4df6-40b3-8ca8-b357fc314571","workspaceId":"a5ffff48-2f89-4188-a99b-f88de16ed5d7"}}},"nbformat":4,"nbformat_minor":5} \ No newline at end of file +{"cells":[{"cell_type":"code","source":["! 
python --version\n","\n","!pip install pyjwt==2.10.1 azure-eventhub==5.13.0 faker==35.0 pyodbc==5.2.0 --upgrade --force --quiet"],"outputs":[],"execution_count":null,"metadata":{"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"b5f2f2be"},{"cell_type":"code","source":["import json\n","from azure.eventhub import EventHubProducerClient, EventData\n","import os\n","import socket\n","import random\n","\n","from random import randrange\n","\n","eventHubNameevents = \"\"\n","eventHubConnString = \"\" \n","\n","producer_events = EventHubProducerClient.from_connection_string(conn_str=eventHubConnString, eventhub_name=eventHubNameevents)\n","\n","hostname = socket.gethostname()"],"outputs":[],"execution_count":null,"metadata":{"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"4ec2ea88-591f-4277-9190-a314194ccfc9"},{"cell_type":"code","source":["from faker import Faker\n","from enum import Enum\n","import datetime\n","\n","# class syntax\n","class EVENT_TYPE(Enum):\n"," CLICK = 1\n"," IMPRESSION = 2\n","\n","productIds = [707,708,711,712,714,715,716,717,718,722,738,739,742,743,747,748,779,780,781,782,783,784,792,793,794,795,796,797,798,799,800,801,808,809,810,813,822,835,836,838,858,859,860,864,865,867,868,869,870,873,874,875,876,877,880,881,883,884,885,886,889,891,892,893,894,895,896,899,900,904,905,907,908,909,910,913,916,917,918,920,924,925,926,935,936,937,938,939,940,944,945,947,948,949,951,952,953,954,955,956,957,958,959,960,961,962,963,964,965,966,967,968,969,970,971,972,973,974,975,976,977,978,979,980,981,982,983,984,985,986,987,988,989,990,991,992,993,994,996,997,998,999]\n","\n","\n","def generateImpressionEvent(isAnomaly, productId):\n"," faker = Faker()\n","\n"," event = {}\n"," event[\"eventType\"] = EVENT_TYPE.IMPRESSION.name\n"," event[\"eventID\"] = faker.uuid4()\n"," \n"," start_date = datetime.date(year=2024, month=1, day=1)\n"," end_date = datetime.datetime.now()\n"," event[\"eventDate\"] =
faker.date_time_between_dates(start_date, end_date).isoformat()\n","\n"," if productId:\n"," event[\"productId\"] = str(productId)\n"," else:\n"," event[\"productId\"] = faker.random_element(productIds)\n","\n"," randomizeUnsupported = randrange(100)\n"," userAgent = {}\n"," userAgent[\"platform\"] = faker.random_element([\"Windows\", \"Mac\", \"Linux\", \"iOS\", \"Android\"])\n"," if randomizeUnsupported <2:\n"," userAgent[\"browser\"] = \"Unsupported\"\n"," else:\n"," if userAgent[\"platform\"] == \"Windows\":\n"," userAgent[\"browser\"] = faker.random_element([\"Edge\", \"Chrome\", \"Firefox\", \"Safari\"])\n"," elif userAgent[\"platform\"] == \"Mac\":\n"," userAgent[\"browser\"] = faker.random_element([\"Chrome\", \"Firefox\", \"Safari\"])\n"," elif userAgent[\"platform\"] == \"Linux\":\n"," userAgent[\"browser\"] = faker.random_element([\"Chrome\", \"Firefox\"])\n"," elif userAgent[\"platform\"] == \"iOS\":\n"," userAgent[\"browser\"] = faker.random_element([\"Safari\", \"Chrome\"])\n"," elif userAgent[\"platform\"] == \"Android\":\n"," userAgent[\"browser\"] = faker.random_element([\"Chrome\", \"Firefox\"])\n"," userAgent[\"browserVersion\"] = faker.random_element([\"10.2\", \"13.6\", \"8.6\", \"8.5\", \"11.2\", \"14.6\", \"6.6\", \"4.5\"])\n"," event[\"userAgent\"] = userAgent\n","\n"," event[\"device\"] = faker.random_element([\"mobile\", \"computer\", \"tablet\", \"mobile\", \"computer\"])\n"," event[\"ip_address\"] = faker.ipv4()\n","\n"," # Adding related products\n"," extraPayload = []\n"," for i in range(randrange(1, 10)): # Random number of related products between 1 and 9\n"," relatedproduct = {\n"," \"relatedProductId\": str(faker.random_element([708, 711, 712, 714, 715])),\n"," \"relatedProductName\": faker.word(),\n"," \"relatedProductCategory\": faker.random_element([\"Electronics\", \"Books\", \"Clothing\", \"Home\", \"Toys\"])\n"," }\n"," extraPayload.append(relatedproduct)\n"," event[\"extraPayload\"] = extraPayload\n","\n"," # only set the
referer for CLICK events\n"," refererPayload = {}\n"," event[\"referer\"] = refererPayload\n"," \n"," if isAnomaly:\n"," event[\"page_loading_seconds\"] = faker.random_number(4)/100\n"," else:\n"," event[\"page_loading_seconds\"] = faker.random_number(4)/1000\n"," return event\n","\n","\n","def generateClickEvent(impressionEvent, isAnomaly):\n"," faker = Faker()\n","\n"," event = {}\n"," event[\"eventType\"] = EVENT_TYPE.CLICK.name\n"," event[\"eventID\"] = impressionEvent[\"eventID\"]\n"," event[\"eventDate\"] = impressionEvent[\"eventDate\"]\n"," event[\"productId\"] = impressionEvent[\"productId\"]\n"," event[\"userAgent\"] = impressionEvent[\"userAgent\"]\n"," event[\"device\"] = impressionEvent[\"device\"]\n"," event[\"ip_address\"] = impressionEvent[\"ip_address\"]\n"," # Adding clickpath\n"," extraPayload = []\n"," for i in range(randrange(1, 10)): # Random number of clicks between 1 and 9\n"," clickpath = {\n"," \"clickType\": faker.random_element([\"button\", \"link\", \"image\", \"text\"]),\n"," \"url\": faker.url(),\n"," \"title\": faker.random_element([\"Brakes\", \"Helmets\", \"Battery\", \"Mirror\", \"Lights\"])\n"," }\n"," extraPayload.append(clickpath)\n"," event[\"extraPayload\"] = extraPayload\n","\n"," # only set the referer for CLICK events, refererPayload differs by campaign type\n"," refererPayload = {} \n"," refererPayload[\"url\"] = faker.uri()\n"," refererPayload[\"campaignType\"] = faker.random_element( [\"organic\", \"bing\", \"google\", \"facebook\", \"instagram\", \"twitter\", \"pinterest\", \"email\", \"affiliate\"])\n"," match refererPayload[\"campaignType\"]:\n"," case \"bing\"| \"google\"| \"facebook\"| \"instagram\":\n"," refererPayload[\"medium\"] = \"cpc\"\n"," refererPayload[\"adId\"] = faker.uuid4()\n"," refererPayload[\"adGroup\"] = faker.uuid4()\n"," refererPayload[\"adTitle\"] = faker.sentence()\n"," case \"twitter\":\n"," refererPayload[\"medium\"] = \"cpc\"\n"," refererPayload[\"adId\"] = faker.uuid4()\n"," case
\"pinterest\":\n"," refererPayload[\"medium\"] = \"cpc\"\n"," refererPayload[\"adId\"] = faker.uuid4()\n"," case \"email\":\n"," refererPayload[\"medium\"] = \"email\"\n"," refererPayload[\"campaignId\"] = faker.uuid4()\n"," refererPayload[\"emailId\"] = faker.email()\n"," case \"affiliate\":\n"," refererPayload[\"medium\"] = \"affiliate\"\n"," refererPayload[\"affiliateId\"] = faker.uuid4()\n"," event[\"referer\"] = refererPayload\n"," \n"," if isAnomaly:\n"," event[\"page_loading_seconds\"] = faker.random_number(4)/100\n"," else:\n"," event[\"page_loading_seconds\"] = faker.random_number(4)/1000\n"," return event"],"outputs":[],"execution_count":null,"metadata":{"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"3aeadd3c-a317-4013-b50b-62cf4c2c4b6b"},{"cell_type":"code","source":["def sendToEventsHub(jsonEvent, producer):\n"," eventString = json.dumps(jsonEvent)\n"," print(eventString) \n"," event_data_batch = producer.create_batch() \n"," event_data_batch.add(EventData(eventString)) \n"," producer.send_batch(event_data_batch)\n","\n","def generateEvents(isAnomaly = False, productId = None):\n"," try:\n"," while True:\n"," impressionEvent = generateImpressionEvent(isAnomaly, productId) \n"," sendToEventsHub(impressionEvent, producer_events)\n"," if random.randint(1, 100) > 80:\n"," clickEvent = generateClickEvent(impressionEvent, isAnomaly) \n"," sendToEventsHub(clickEvent, producer_events)\n"," except KeyboardInterrupt:\n"," producer_events.close()\n"],"outputs":[],"execution_count":null,"metadata":{"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"1ccddd75-ee90-4998-b2ff-d28bacaa5d78"},{"cell_type":"code","source":["import datetime\n","print(datetime.datetime.now())\n","generateEvents(False, 
None)\n","print(datetime.datetime.now())\n"],"outputs":[],"execution_count":null,"metadata":{"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"d1526001-31e5-47e4-b1e2-6875d8253c07"}],"metadata":{"kernel_info":{"name":"synapse_pyspark"},"kernelspec":{"name":"synapse_pyspark","language":"Python","display_name":"Synapse PySpark"},"language_info":{"name":"python"},"microsoft":{"language":"python","language_group":"synapse_pyspark","ms_spell_check":{"ms_spell_check_language":"en"}},"nteract":{"version":"nteract-front-end@1.0.0"},"orig_nbformat":4,"widgets":{},"spark_compute":{"compute_id":"/trident/default"},"dependencies":{"environment":{"environmentId":"dd0720a1-4df6-40b3-8ca8-b357fc314571","workspaceId":"a5ffff48-2f89-4188-a99b-f88de16ed5d7"}}},"nbformat":4,"nbformat_minor":5}