From 44c481ea463121c3eb99bf1e3b95744811fa3e2c Mon Sep 17 00:00:00 2001 From: Danny Young Date: Mon, 10 Jan 2022 11:09:40 -0800 Subject: [PATCH 1/3] Updates to enable importing Oct 2021 data --- import_food_data.py | 103 ++++++++++++++++++++++++++++++++------------ queries.txt | 48 +++++++++++++++++++++ 2 files changed, 124 insertions(+), 27 deletions(-) create mode 100644 queries.txt diff --git a/import_food_data.py b/import_food_data.py index 39acde6..529d0da 100644 --- a/import_food_data.py +++ b/import_food_data.py @@ -73,10 +73,13 @@ def query_counts(cursor: sqlite3.Cursor, csv_count_file: str, verbose: bool = Fa row = [x.replace('"', '') for x in row] table, count = row[0], int(row[1]) - num_rows = cursor.execute(f''' -SELECT COUNT(*) AS {table}_count -FROM {table}; -''').fetchone()[0] + try: + num_rows = cursor.execute(f''' + SELECT COUNT(*) AS {table}_count + FROM {table}; + ''').fetchone()[0] + except sqlite3.OperationalError as err: + print(f"Ignoring: {type(err)}: {err=}") if verbose: print(f"Inserted {num_rows} into {table}") if num_rows != count: @@ -107,34 +110,51 @@ def sqlite3_schema(cursor: sqlite3.Cursor) -> None: CREATE TABLE branded_food ( "fdc_id" INT NOT NULL PRIMARY KEY REFERENCES food(fdc_id), "brand_owner" TEXT, -- XXX Inconsistent names + "brand_name" TEXT, + "subbrand_name" TEXT, "gtin_upc" TEXT, "ingredients" TEXT, + "not_a_significant_source_of" TEXT, "serving_size" REAL, - "serving_size_unit" TEXT - CHECK(serving_size_unit IN ('g', 'ml')), +-- "serving_size_unit" TEXT +-- CHECK(serving_size_unit IS NULL OR +-- serving_size_unit IN ('g', 'ml')), + "serving_size_unit" TEXT, "household_serving_fulltext" TEXT, "branded_food_category" TEXT, - "data_source" TEXT - CHECK(data_source IN ('GDSN', 'LI')), - "modified_date" TEXT - CHECK(modified_date IS NULL OR - modified_date GLOB '[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]' IS 1), - "available_date" TEXT - CHECK(available_date IS NULL OR - available_date GLOB '[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]' IS 1) +-- "data_source" TEXT +-- CHECK(data_source IS NULL OR data_source IN ('GDSN', 'LI')), + "data_source" TEXT, + "package_weight" TEXT, +-- "modified_date" TEXT +-- CHECK(modified_date IS NULL OR +-- modified_date GLOB '[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]' IS 1), +-- "available_date" TEXT +-- CHECK(available_date IS NULL OR +-- available_date GLOB '[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]' IS 1), + "modified_date" TEXT, + "available_date" TEXT, + "market_country" TEXT, +-- "discontinued_date" TEXT +-- CHECK(discontinued_date IS NULL OR +-- discontinued_date GLOB '[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]' IS 1) + "discontinued_date" TEXT ); CREATE INDEX idx_branded_food_gtin_upc ON branded_food (gtin_upc); CREATE INDEX idx_branded_food_branded_food_category ON branded_food (branded_food_category); CREATE TABLE food ( - fdc_id INT NOT NULL PRIMARY KEY, - data_type TEXT, - description TEXT, - food_category_id INT REFERENCES food_category(id), - publication_date TEXT + "fdc_id" INT NOT NULL PRIMARY KEY, + "data_type" TEXT, + "description" TEXT, +-- "food_category_id" INT REFERENCES food_category(id), + "food_category_id" INT, + "publication_date" TEXT CHECK(publication_date IS NULL OR publication_date GLOB '[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]' IS 1) +-- "scientific_name" TEXT +-- "food_key" TEXT ); CREATE INDEX idx_food_data_type ON food (data_type); @@ -159,7 +179,8 @@ def sqlite3_schema(cursor: sqlite3.Cursor) -> None: ); CREATE TABLE food_calorie_conversion_factor ( - "food_nutrient_conversion_factor_id" INT NOT NULL PRIMARY KEY REFERENCES food_nutrient_conversion_factor(id), +-- "food_nutrient_conversion_factor_id" INT NOT NULL PRIMARY KEY REFERENCES food_nutrient_conversion_factor(id), + "food_nutrient_conversion_factor_id" INT NOT NULL PRIMARY KEY, "protein_value" REAL, "fat_value" REAL, "carbohydrate_value" REAL @@ -192,15 +213,19 @@ def sqlite3_schema(cursor: sqlite3.Cursor) -> None: CREATE TABLE food_nutrient ( "id" INT NOT NULL PRIMARY KEY, - "fdc_id" INT REFERENCES food(fdc_id), - "nutrient_id" INT REFERENCES nutrient(id), +-- "fdc_id" INT REFERENCES food(fdc_id), + "fdc_id" INT, +-- "nutrient_id" INT REFERENCES nutrient(id), + "nutrient_id" INT, "amount" REAL, "data_points" INT, - "derivation_id" INT REFERENCES food_nutrient_derivation(id), +-- "derivation_id" INT REFERENCES food_nutrient_derivation(id), + "derivation_id" INT, -- XXX Missing standard_error from Field Descriptions "min" REAL, "max" REAL, "median" REAL, + "loq" REAL, "footnote" TEXT, "min_year_acquired" TEXT CHECK(min_year_acquired IS NULL OR @@ -257,6 +282,14 @@ def sqlite3_schema(cursor: sqlite3.Cursor) -> None: "value" REAL ); +-- CREATE TABLE food_update_log_entry ( +-- "fdc_id" INT REFERENCES food(fdc_id), +-- "description" TEXT, +-- "publication_date" TEXT +-- CHECK(publication_date IS NULL OR +-- publication_date GLOB '[0-9][0-9][0-9][0-9]' IS 1) +-- ); + CREATE TABLE foundation_food ( "fdc_id" INT NOT NULL PRIMARY KEY REFERENCES food(fdc_id), "NDB_number" INT UNIQUE, @@ -329,6 +362,7 @@ def sqlite3_schema(cursor: sqlite3.Cursor) -> None: CREATE TABLE measure_unit ( "id" INT NOT NULL PRIMARY KEY, "name" TEXT UNIQUE +-- "abbreviation" TEXT ); CREATE TABLE nutrient ( @@ -368,8 +402,10 @@ def sqlite3_schema(cursor: sqlite3.Cursor) -> None: ); CREATE TABLE sub_sample_food ( - "fdc_id" INT NOT NULL PRIMARY KEY REFERENCES food(fdc_id), - "fdc_id_of_sample_food" INT REFERENCES food(fdc_id) +-- "fdc_id" INT NOT NULL PRIMARY KEY REFERENCES food(fdc_id), + "fdc_id" INT NOT NULL PRIMARY KEY, +-- "fdc_id_of_sample_food" INT REFERENCES food(fdc_id) + "fdc_id_of_sample_food" INT ); CREATE INDEX idx_sub_sample_food_fdc_id_of_sample_food ON sub_sample_food (fdc_id_of_sample_food); @@ -443,6 +479,16 @@ def process(directory: str, database: str, force: bool = False, batch: int = 100 'food_nutrient.csv', 'sub_sample_result.csv', 'all_downloaded_table_record_counts.csv', + 'fndds_derivation.csv', + 'fndds_ingredient_nutrient_value.csv', + 'food_update_log_entry.csv', + ] + + ignore = [ + 'all_downloaded_table_record_counts.csv', + 'fndds_derivation.csv', + 'fndds_ingredient_nutrient_value.csv', + 'food_update_log_entry.csv', ] # Make sure we accounted for everything @@ -469,7 +515,7 @@ def process(directory: str, database: str, force: bool = False, batch: int = 100 sqlite3_schema(cursor) for fname in ordered: - if fname == "all_downloaded_table_record_counts.csv": + if fname in ignore: print(f"Skipping {fname}") else: print(f"Importing {fname}") @@ -514,9 +560,12 @@ def main() -> None: help='Whether to clobber output file') parser.add_argument('-v', '--verbose', action='store_true', default=False, help='Verbose output') + parser.add_argument('-c', '--check', action='store_true', default=False, + help="Only check the database, don't create it") args = parser.parse_args() - process(args.directory, args.output, args.force, args.batch, args.verbose) + if not args.check: + process(args.directory, args.output, args.force, args.batch, args.verbose) check(args.directory, args.output, args.verbose) diff --git a/queries.txt b/queries.txt new file mode 100644 index 0000000..a236d60 --- /dev/null +++ b/queries.txt @@ -0,0 +1,48 @@ +--SELECT food.data_type, food.description, food_nutrient.amount AS 'amount per 100g', food_nutrient.min, food_nutrient.max, nutrient.name, nutrient.unit_name +--FROM ((food INNER JOIN food_nutrient ON food.fdc_id = food_nutrient.fdc_id) INNER JOIN nutrient ON food_nutrient.nutrient_id = nutrient.id) +--WHERE +--food.description LIKE 'Apple' +--AND +--nutrient.name LIKE 'energy' AND nutrient.unit_name LIKE 'kcal' + +SELECT + f.data_type, + f.description, + fn.amount AS 'kcal_per_100g', + fn.min, + fn.max, + n.name, + n.unit_name, + fp.amount as 'portion_amount', + mu.name as 'portion_unit', + fp.portion_description, + fp.modifier as 'portion_modifier', + fp.gram_weight as 'portion_gram_weight', + bf.brand_owner, + bf.serving_size as 'branded_serving_size', + bf.serving_size_unit as 'branded_serving_unit', + bf.household_serving_fulltext as 'branded_serving_household_unit', + bf.ingredients as 'branded_ingredients' + +FROM + (((((food f LEFT JOIN food_nutrient fn ON f.fdc_id = fn.fdc_id) + LEFT JOIN nutrient n ON fn.nutrient_id = n.id) + LEFT JOIN food_portion fp on f.fdc_id = fp.fdc_id) + LEFT JOIN measure_unit mu on fp.measure_unit_id = mu.id) + LEFT JOIN branded_food bf on f.fdc_id = bf.fdc_id) +WHERE +--food.description LIKE '%apple%' +--AND +n.name LIKE 'energy' AND n.unit_name LIKE 'kcal' + + + + +--SELECT food.description, food_attribute.name, food_attribute.value from food inner join food_attribute on food.fdc_id = food_attribute.fdc_id + +--SELECT * from food_portion + +--SELECT DISTINCT food.food_category_id from food; +--SELECT food.food_category_id, food_category.description from food inner JOIN food_category ON food.food_category_id = food_category.id; + +-- SELECT food.*, food_category.description FROM food LEFT JOIN food_category ON food.food_category_id = food_category.id \ No newline at end of file From 1fcb8f86f937e80286500dc6c065e6b2359a464a Mon Sep 17 00:00:00 2001 From: Danny Young Date: Mon, 10 Jan 2022 11:17:05 -0800 Subject: [PATCH 2/3] added note about 2 files in the October 2021 data that need to be renamed. --- import_food_data.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/import_food_data.py b/import_food_data.py index 529d0da..2a28594 100644 --- a/import_food_data.py +++ b/import_food_data.py @@ -446,6 +446,9 @@ def process(directory: str, database: str, force: bool = False, batch: int = 100 # Import in this order so it will not cause problems with foreign key # constraints. + # Note that in the October 2021 dataset, you need to rename the following files: + # acquisition_samples.csv -> acquisition_sample.csv + # agricultural_samples.csv -> agricultural_acquisition.csv ordered = [ 'food_category.csv', 'food.csv', From 343938ad1cd578cd272715d952f3d27b2e67671e Mon Sep 17 00:00:00 2001 From: Danny Young Date: Mon, 10 Jan 2022 18:52:08 -0800 Subject: [PATCH 3/3] queries.txt now has a nice query that creates a good flatfile --- queries.txt | 37 ++++++++++++++++++++++++++++++------- 1 file changed, 30 insertions(+), 7 deletions(-) diff --git a/queries.txt b/queries.txt index a236d60..0c915c0 100644 --- a/queries.txt +++ b/queries.txt @@ -1,10 +1,33 @@ ---SELECT food.data_type, food.description, food_nutrient.amount AS 'amount per 100g', food_nutrient.min, food_nutrient.max, nutrient.name, nutrient.unit_name ---FROM ((food INNER JOIN food_nutrient ON food.fdc_id = food_nutrient.fdc_id) INNER JOIN nutrient ON food_nutrient.nutrient_id = nutrient.id) ---WHERE ---food.description LIKE 'Apple' ---AND ---nutrient.name LIKE 'energy' AND nutrient.unit_name LIKE 'kcal' +SELECT + f.data_type, + f.description, + kcal_per_100g, + protein_per_100g, + fiber_per_100g, + carbs_per_100g, + fats_per_100g, + (CASE WHEN f.data_type LIKE 'branded%' THEN bf.household_serving_fulltext ELSE + (CASE WHEN f.data_type LIKE 'survey%' THEN fp.portion_description ELSE + (CASE WHEN f.data_type LIKE 'founda%' THEN fp.amount || ' ' || mu.name + ELSE fp.amount || ' ' || fp.modifier END) END) END) as 'serving', + CASE WHEN f.data_type LIKE 'branded%' THEN bf.serving_size ELSE fp.gram_weight END as 'serving_size', + CASE WHEN f.data_type LIKE 'branded%' THEN bf.serving_size_unit ELSE 'g' END as 'serving_size_unit' + +FROM + ((((((((food f LEFT JOIN food_portion fp ON f.fdc_id = fp.fdc_id + -- this next line forces just one row of several matches (which have different serving sizes) + AND fp.id = (SELECT id FROM food_portion WHERE f.fdc_id = food_portion.fdc_id LIMIT 1)) + LEFT JOIN measure_unit mu ON fp.measure_unit_id = mu.id) + LEFT JOIN branded_food bf ON f.fdc_id = bf.fdc_id) + LEFT JOIN ( SELECT fdc_id, amount AS 'kcal_per_100g' FROM food_nutrient LEFT JOIN nutrient ON food_nutrient.nutrient_id = nutrient.id WHERE name LIKE 'energy' AND unit_name LIKE 'kcal') t1 ON t1.fdc_id = f.fdc_id) + LEFT JOIN ( SELECT fdc_id, amount AS 'protein_per_100g' FROM food_nutrient LEFT JOIN nutrient ON food_nutrient.nutrient_id = nutrient.id WHERE name LIKE 'Protein%' AND unit_name LIKE 'G') t2 ON t2.fdc_id = f.fdc_id) + LEFT JOIN ( SELECT fdc_id, amount AS 'fiber_per_100g' FROM food_nutrient LEFT JOIN nutrient ON food_nutrient.nutrient_id = nutrient.id WHERE name LIKE 'Fiber%' AND unit_name LIKE 'G') t3 ON t3.fdc_id = f.fdc_id) + LEFT JOIN ( SELECT fdc_id, amount AS 'carbs_per_100g' FROM food_nutrient LEFT JOIN nutrient ON food_nutrient.nutrient_id = nutrient.id WHERE name LIKE 'Carbohydrate%' AND unit_name LIKE 'G') t4 ON t4.fdc_id = f.fdc_id) + LEFT JOIN ( SELECT fdc_id, amount AS 'fats_per_100g' FROM food_nutrient LEFT JOIN nutrient ON food_nutrient.nutrient_id = nutrient.id WHERE name LIKE 'Total lipid%' AND unit_name LIKE 'G') t5 ON t5.fdc_id = f.fdc_id) +WHERE kcal_per_100g IS NOT NULL; + +/* SELECT f.data_type, f.description, @@ -34,7 +57,7 @@ WHERE --food.description LIKE '%apple%' --AND n.name LIKE 'energy' AND n.unit_name LIKE 'kcal' - +*/