From 44c481ea463121c3eb99bf1e3b95744811fa3e2c Mon Sep 17 00:00:00 2001
From: Danny Young <git@youngshome.com>
Date: Mon, 10 Jan 2022 11:09:40 -0800
Subject: [PATCH 1/3] Updates to enable importing Oct 2021 data

---
 import_food_data.py | 103 ++++++++++++++++++++++++++++++++------------
 queries.txt         |  48 +++++++++++++++++++++
 2 files changed, 124 insertions(+), 27 deletions(-)
 create mode 100644 queries.txt

diff --git a/import_food_data.py b/import_food_data.py
index 39acde6..529d0da 100644
--- a/import_food_data.py
+++ b/import_food_data.py
@@ -73,10 +73,13 @@ def query_counts(cursor: sqlite3.Cursor, csv_count_file: str, verbose: bool = Fa
             row = [x.replace('"', '') for x in row]
             table, count = row[0], int(row[1])
 
-            num_rows = cursor.execute(f'''
-SELECT COUNT(*) AS {table}_count
-FROM {table};
-''').fetchone()[0]
+            try:
+              num_rows = cursor.execute(f'''
+  SELECT COUNT(*) AS {table}_count
+  FROM {table};
+  ''').fetchone()[0]
+            except sqlite3.OperationalError as err:
+              print(f"Ignoring: {type(err)}: {err=}")
             if verbose:
                 print(f"Inserted {num_rows} into {table}")
             if num_rows != count:
@@ -107,34 +110,51 @@ def sqlite3_schema(cursor: sqlite3.Cursor) -> None:
 CREATE TABLE branded_food (
   "fdc_id"                     INT NOT NULL PRIMARY KEY REFERENCES food(fdc_id),
   "brand_owner"                TEXT,  -- XXX Inconsistent names
+  "brand_name"                 TEXT,
+  "subbrand_name"              TEXT,
   "gtin_upc"                   TEXT,
   "ingredients"                TEXT,
+  "not_a_significant_source_of" TEXT,
   "serving_size"               REAL,
-  "serving_size_unit"          TEXT
-      CHECK(serving_size_unit IN ('g', 'ml')),
+--  "serving_size_unit"          TEXT
+--      CHECK(serving_size_unit IS NULL OR
+--            serving_size_unit IN ('g', 'ml')),
+  "serving_size_unit"          TEXT,
   "household_serving_fulltext" TEXT,
   "branded_food_category"      TEXT,
-  "data_source"                TEXT
-      CHECK(data_source IN ('GDSN', 'LI')),
-  "modified_date"              TEXT
-      CHECK(modified_date IS NULL OR
-            modified_date GLOB '[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]' IS 1),
-  "available_date"             TEXT
-      CHECK(available_date IS NULL OR
-            available_date GLOB '[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]' IS 1)
+--  "data_source"                TEXT
+--      CHECK(data_source IS NULL OR data_source IN ('GDSN', 'LI')),
+  "data_source"                TEXT,
+  "package_weight"             TEXT,
+--  "modified_date"              TEXT
+--      CHECK(modified_date IS NULL OR
+--            modified_date GLOB '[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]' IS 1),
+--  "available_date"             TEXT
+--      CHECK(available_date IS NULL OR
+--            available_date GLOB '[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]' IS 1),
+  "modified_date"              TEXT,
+  "available_date"             TEXT,
+  "market_country"             TEXT,
+--  "discontinued_date"              TEXT
+--      CHECK(discontinued_date IS NULL OR
+--            discontinued_date GLOB '[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]' IS 1)
+  "discontinued_date"              TEXT
 );
 
 CREATE INDEX idx_branded_food_gtin_upc              ON branded_food (gtin_upc);
 CREATE INDEX idx_branded_food_branded_food_category ON branded_food (branded_food_category);
 
 CREATE TABLE food (
-  fdc_id           INT NOT NULL PRIMARY KEY,
-  data_type        TEXT,
-  description      TEXT,
-  food_category_id INT REFERENCES food_category(id),
-  publication_date TEXT
+  "fdc_id"           INT NOT NULL PRIMARY KEY,
+  "data_type"        TEXT,
+  "description"      TEXT,
+--  "food_category_id" INT REFERENCES food_category(id),
+  "food_category_id" INT,
+  "publication_date" TEXT
       CHECK(publication_date IS NULL OR
             publication_date GLOB '[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]' IS 1)
+--  "scientific_name"  TEXT
+--  "food_key"         TEXT
 );
 
 CREATE INDEX idx_food_data_type        ON food (data_type);
@@ -159,7 +179,8 @@ def sqlite3_schema(cursor: sqlite3.Cursor) -> None:
 );
 
 CREATE TABLE food_calorie_conversion_factor (
-  "food_nutrient_conversion_factor_id" INT NOT NULL PRIMARY KEY REFERENCES food_nutrient_conversion_factor(id),
+--  "food_nutrient_conversion_factor_id" INT NOT NULL PRIMARY KEY REFERENCES food_nutrient_conversion_factor(id),
+  "food_nutrient_conversion_factor_id" INT NOT NULL PRIMARY KEY,
   "protein_value"      REAL,
   "fat_value"          REAL,
   "carbohydrate_value" REAL
@@ -192,15 +213,19 @@ def sqlite3_schema(cursor: sqlite3.Cursor) -> None:
 
 CREATE TABLE food_nutrient (
   "id"                INT NOT NULL PRIMARY KEY,
-  "fdc_id"            INT REFERENCES food(fdc_id),
-  "nutrient_id"       INT REFERENCES nutrient(id),
+--  "fdc_id"            INT REFERENCES food(fdc_id),
+  "fdc_id"            INT,
+--  "nutrient_id"       INT REFERENCES nutrient(id),
+  "nutrient_id"       INT,
   "amount"            REAL,
   "data_points"       INT,
-  "derivation_id"     INT REFERENCES food_nutrient_derivation(id),
+--  "derivation_id"     INT REFERENCES food_nutrient_derivation(id),
+  "derivation_id"     INT,
   -- XXX Missing standard_error from Field Descriptions
   "min"               REAL,
   "max"               REAL,
   "median"            REAL,
+  "loq"               REAL,
   "footnote"          TEXT,
   "min_year_acquired" TEXT
       CHECK(min_year_acquired IS NULL OR
@@ -257,6 +282,14 @@ def sqlite3_schema(cursor: sqlite3.Cursor) -> None:
   "value"                              REAL
 );
 
+-- CREATE TABLE food_update_log_entry (
+--   "fdc_id"               INT REFERENCES food(fdc_id),
+--   "description"          TEXT,
+--   "publication_date"   TEXT
+--       CHECK(publication_date IS NULL OR
+--             publication_date GLOB '[0-9][0-9][0-9][0-9]' IS 1)
+-- );
+
 CREATE TABLE foundation_food (
   "fdc_id"     INT NOT NULL PRIMARY KEY REFERENCES food(fdc_id),
   "NDB_number" INT UNIQUE,
@@ -329,6 +362,7 @@ def sqlite3_schema(cursor: sqlite3.Cursor) -> None:
 CREATE TABLE measure_unit (
   "id"   INT NOT NULL PRIMARY KEY,
   "name" TEXT UNIQUE
+--  "abbreviation" TEXT
 );
 
 CREATE TABLE nutrient (
@@ -368,8 +402,10 @@ def sqlite3_schema(cursor: sqlite3.Cursor) -> None:
 );
 
 CREATE TABLE sub_sample_food (
-  "fdc_id"                INT NOT NULL PRIMARY KEY REFERENCES food(fdc_id),
-  "fdc_id_of_sample_food" INT REFERENCES food(fdc_id)
+--  "fdc_id"                INT NOT NULL PRIMARY KEY REFERENCES food(fdc_id),
+  "fdc_id"                INT NOT NULL PRIMARY KEY,
+--  "fdc_id_of_sample_food" INT REFERENCES food(fdc_id)
+  "fdc_id_of_sample_food" INT
 );
 
 CREATE INDEX idx_sub_sample_food_fdc_id_of_sample_food ON sub_sample_food (fdc_id_of_sample_food);
@@ -443,6 +479,16 @@ def process(directory: str, database: str, force: bool = False, batch: int = 100
         'food_nutrient.csv',
         'sub_sample_result.csv',
         'all_downloaded_table_record_counts.csv',
+        'fndds_derivation.csv',
+        'fndds_ingredient_nutrient_value.csv',
+        'food_update_log_entry.csv',
+    ]
+
+    ignore = [
+        'all_downloaded_table_record_counts.csv',
+        'fndds_derivation.csv',
+        'fndds_ingredient_nutrient_value.csv',
+        'food_update_log_entry.csv',
     ]
 
     # Make sure we accounted for everything
@@ -469,7 +515,7 @@ def process(directory: str, database: str, force: bool = False, batch: int = 100
         sqlite3_schema(cursor)
 
         for fname in ordered:
-            if fname == "all_downloaded_table_record_counts.csv":
+            if fname in ignore:
                 print(f"Skipping {fname}")
             else:
                 print(f"Importing {fname}")
@@ -514,9 +560,12 @@ def main() -> None:
                         help='Whether to clobber output file')
     parser.add_argument('-v', '--verbose', action='store_true', default=False,
                         help='Verbose output')
+    parser.add_argument('-c', '--check', action='store_true', default=False,
+                        help="Only check the database, don't create it")
     args = parser.parse_args()
 
-    process(args.directory, args.output, args.force, args.batch, args.verbose)
+    if not args.check:
+      process(args.directory, args.output, args.force, args.batch, args.verbose)
     check(args.directory, args.output, args.verbose)
 
 
diff --git a/queries.txt b/queries.txt
new file mode 100644
index 0000000..a236d60
--- /dev/null
+++ b/queries.txt
@@ -0,0 +1,48 @@
+--SELECT food.data_type, food.description, food_nutrient.amount AS 'amount per 100g', food_nutrient.min, food_nutrient.max, nutrient.name, nutrient.unit_name
+--FROM ((food INNER JOIN food_nutrient ON food.fdc_id = food_nutrient.fdc_id) INNER JOIN nutrient ON food_nutrient.nutrient_id = nutrient.id) 
+--WHERE 
+--food.description LIKE 'Apple'
+--AND
+--nutrient.name LIKE 'energy' AND nutrient.unit_name LIKE 'kcal'
+
+SELECT 
+	f.data_type, 
+	f.description, 
+	fn.amount AS 'kcal_per_100g', 
+	fn.min, 
+	fn.max, 
+	n.name, 
+	n.unit_name,
+	fp.amount as 'portion_amount', 
+	mu.name as 'portion_unit',
+	fp.portion_description, 
+	fp.modifier as 'portion_modifier', 
+	fp.gram_weight as 'portion_gram_weight',
+	bf.brand_owner,
+	bf.serving_size as 'branded_serving_size',
+	bf.serving_size_unit as 'branded_serving_unit',
+	bf.household_serving_fulltext as 'branded_serving_household_unit',
+	bf.ingredients as 'branded_ingredients'
+	
+FROM 
+	(((((food f LEFT JOIN food_nutrient fn ON f.fdc_id = fn.fdc_id) 
+		   LEFT JOIN nutrient n ON fn.nutrient_id = n.id)
+		   LEFT JOIN food_portion fp on f.fdc_id = fp.fdc_id)
+		   LEFT JOIN measure_unit mu on fp.measure_unit_id = mu.id)
+		   LEFT JOIN branded_food bf on f.fdc_id = bf.fdc_id)
+WHERE 
+--food.description LIKE '%apple%'
+--AND
+n.name LIKE 'energy' AND n.unit_name LIKE 'kcal'
+
+
+
+
+--SELECT food.description, food_attribute.name, food_attribute.value from food inner join food_attribute on food.fdc_id = food_attribute.fdc_id
+
+--SELECT * from food_portion
+
+--SELECT DISTINCT food.food_category_id from food;
+--SELECT food.food_category_id, food_category.description from food inner JOIN food_category ON food.food_category_id = food_category.id;
+
+-- SELECT food.*, food_category.description FROM food LEFT JOIN food_category ON food.food_category_id = food_category.id
\ No newline at end of file

From 1fcb8f86f937e80286500dc6c065e6b2359a464a Mon Sep 17 00:00:00 2001
From: Danny Young <git@youngshome.com>
Date: Mon, 10 Jan 2022 11:17:05 -0800
Subject: [PATCH 2/3] Add note about two files in the October 2021 data that
 need to be renamed

---
 import_food_data.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/import_food_data.py b/import_food_data.py
index 529d0da..2a28594 100644
--- a/import_food_data.py
+++ b/import_food_data.py
@@ -446,6 +446,9 @@ def process(directory: str, database: str, force: bool = False, batch: int = 100
 
     # Import in this order so it will not cause problems with foreign key
     # constraints.
+    # Note that in the October 2021 dataset, you need to rename the following files:
+    #    acquisition_samples.csv -> acquisition_sample.csv
+    #    agricultural_samples.csv -> agricultural_acquisition.csv
     ordered = [
         'food_category.csv',
         'food.csv',

From 343938ad1cd578cd272715d952f3d27b2e67671e Mon Sep 17 00:00:00 2001
From: Danny Young <git@youngshome.com>
Date: Mon, 10 Jan 2022 18:52:08 -0800
Subject: [PATCH 3/3] queries.txt now has a nice query that creates a good
 flatfile

---
 queries.txt | 37 ++++++++++++++++++++++++++++++-------
 1 file changed, 30 insertions(+), 7 deletions(-)

diff --git a/queries.txt b/queries.txt
index a236d60..0c915c0 100644
--- a/queries.txt
+++ b/queries.txt
@@ -1,10 +1,33 @@
---SELECT food.data_type, food.description, food_nutrient.amount AS 'amount per 100g', food_nutrient.min, food_nutrient.max, nutrient.name, nutrient.unit_name
---FROM ((food INNER JOIN food_nutrient ON food.fdc_id = food_nutrient.fdc_id) INNER JOIN nutrient ON food_nutrient.nutrient_id = nutrient.id) 
---WHERE 
---food.description LIKE 'Apple'
---AND
---nutrient.name LIKE 'energy' AND nutrient.unit_name LIKE 'kcal'
+SELECT 
+	f.data_type, 
+	f.description, 
+	kcal_per_100g,
+	protein_per_100g,
+	fiber_per_100g,
+	carbs_per_100g,
+	fats_per_100g,
+	(CASE WHEN f.data_type LIKE 'branded%' THEN bf.household_serving_fulltext ELSE 
+		(CASE WHEN f.data_type LIKE 'survey%' THEN fp.portion_description ELSE 
+		(CASE WHEN f.data_type LIKE 'founda%' THEN fp.amount || ' ' || mu.name 
+											ELSE fp.amount || ' ' || fp.modifier END) END) END)  as 'serving',
+	CASE WHEN f.data_type LIKE 'branded%' THEN bf.serving_size ELSE fp.gram_weight END as 'serving_size',
+	CASE WHEN f.data_type LIKE 'branded%' THEN bf.serving_size_unit ELSE 'g' END as 'serving_size_unit'
+	
+FROM 
+	((((((((food f LEFT JOIN food_portion fp ON f.fdc_id = fp.fdc_id
+	-- The next line keeps a single food_portion row per food when several match (they differ in serving size); with LIMIT 1 and no ORDER BY, which row is kept is arbitrary.
+								AND fp.id = (SELECT id FROM food_portion WHERE f.fdc_id = food_portion.fdc_id LIMIT 1))
+		   LEFT JOIN measure_unit mu ON fp.measure_unit_id = mu.id)
+		   LEFT JOIN branded_food bf ON f.fdc_id = bf.fdc_id)
+		   LEFT JOIN ( SELECT fdc_id, amount AS 'kcal_per_100g' FROM food_nutrient LEFT JOIN nutrient ON food_nutrient.nutrient_id = nutrient.id WHERE name LIKE 'energy' AND unit_name LIKE 'kcal') t1 ON t1.fdc_id = f.fdc_id)
+		   LEFT JOIN ( SELECT fdc_id, amount AS 'protein_per_100g' FROM food_nutrient LEFT JOIN nutrient ON food_nutrient.nutrient_id = nutrient.id WHERE name LIKE 'Protein%' AND unit_name LIKE 'G') t2 ON t2.fdc_id = f.fdc_id)
+		   LEFT JOIN ( SELECT fdc_id, amount AS 'fiber_per_100g' FROM food_nutrient LEFT JOIN nutrient ON food_nutrient.nutrient_id = nutrient.id WHERE name LIKE 'Fiber%' AND unit_name LIKE 'G') t3 ON t3.fdc_id = f.fdc_id)
+		   LEFT JOIN ( SELECT fdc_id, amount AS 'carbs_per_100g' FROM food_nutrient LEFT JOIN nutrient ON food_nutrient.nutrient_id = nutrient.id WHERE name LIKE 'Carbohydrate%' AND unit_name LIKE 'G') t4 ON t4.fdc_id = f.fdc_id)
+		   LEFT JOIN ( SELECT fdc_id, amount AS 'fats_per_100g' FROM food_nutrient LEFT JOIN nutrient ON food_nutrient.nutrient_id = nutrient.id WHERE name LIKE 'Total lipid%' AND unit_name LIKE 'G') t5 ON t5.fdc_id = f.fdc_id)
 
+WHERE kcal_per_100g IS NOT NULL;
+
+/*
 SELECT 
 	f.data_type, 
 	f.description, 
@@ -34,7 +57,7 @@ WHERE
 --food.description LIKE '%apple%'
 --AND
 n.name LIKE 'energy' AND n.unit_name LIKE 'kcal'
-
+*/