diff --git a/algorithm.ipynb b/algorithm.ipynb index 7776294..0c3cbf5 100644 --- a/algorithm.ipynb +++ b/algorithm.ipynb @@ -77,9 +77,23 @@ "# Under the hood the library uses findspark to initialise\n", "# Spark's environment. pyspark imports will be available \n", "# after initialisation\n", - "spark = get_session(type='regular', app_name=\"ImageRec-DEV Training\")\n", + "# spark = get_session(type='regular', extra_settings={'spark.driver.maxResultSize': '2048M'})\n", + "import os\n", + "import findspark\n", + "SPARK_HOME = os.environ.get(\"SPARK_HOME\", \"/usr/lib/spark2\")\n", + "findspark.init(SPARK_HOME)\n", + "findspark._add_to_submit_args('--driver-memory 64G --conf \"spark.driver.extraJavaOptions=-XX:+UseG1GC -XX:+PrintGCDetails -XX:+PrintGCTimeStamps\"')\n", + "master = 'yarn'\n", + "app_name = 'gmodena-imagematching-driver-highmem'\n", "import pyspark\n", - "import pyspark.sql" + "from pyspark.sql import SparkSession\n", + "\n", + "spark = (\n", + " SparkSession.builder\n", + " .master(master)\n", + " .config('spark.driver.maxResultSize', '2048M')\n", + " .appName(app_name)\n", + " ).getOrCreate()" ] }, { @@ -301,7 +315,6 @@ " LATERAL VIEW explode(labels) t AS label_lang,label_val\n", " LATERAL VIEW OUTER explode(claims) c AS claim\n", " WHERE typ='item'\n", - " AND t.label_lang='\"\"\"+label_lang+\"\"\"'\n", " AND snapshot='\"\"\"+snapshot+\"\"\"'\n", " AND claim.mainSnak.property in ('P18','P31','P373')\n", " GROUP BY id,label_val\n", @@ -597,10 +610,10 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.1" + "version": "3.7.3" }, "toc-showtags": true }, "nbformat": 4, "nbformat_minor": 4 -} \ No newline at end of file +}