Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
106 changes: 79 additions & 27 deletions import_food_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,10 +73,13 @@ def query_counts(cursor: sqlite3.Cursor, csv_count_file: str, verbose: bool = Fa
row = [x.replace('"', '') for x in row]
table, count = row[0], int(row[1])

num_rows = cursor.execute(f'''
SELECT COUNT(*) AS {table}_count
FROM {table};
''').fetchone()[0]
try:
num_rows = cursor.execute(f'''
SELECT COUNT(*) AS {table}_count
FROM {table};
''').fetchone()[0]
except sqlite3.OperationalError as err:
print(f"Ignoring: {type(err)}: {err=}")
if verbose:
print(f"Inserted {num_rows} into {table}")
if num_rows != count:
Expand Down Expand Up @@ -107,34 +110,51 @@ def sqlite3_schema(cursor: sqlite3.Cursor) -> None:
CREATE TABLE branded_food (
"fdc_id" INT NOT NULL PRIMARY KEY REFERENCES food(fdc_id),
"brand_owner" TEXT, -- XXX Inconsistent names
"brand_name" TEXT,
"subbrand_name" TEXT,
"gtin_upc" TEXT,
"ingredients" TEXT,
"not_a_significant_source_of" TEXT,
"serving_size" REAL,
"serving_size_unit" TEXT
CHECK(serving_size_unit IN ('g', 'ml')),
-- "serving_size_unit" TEXT
-- CHECK(serving_size_unit IS NULL OR
-- serving_size_unit IN ('g', 'ml')),
"serving_size_unit" TEXT,
"household_serving_fulltext" TEXT,
"branded_food_category" TEXT,
"data_source" TEXT
CHECK(data_source IN ('GDSN', 'LI')),
"modified_date" TEXT
CHECK(modified_date IS NULL OR
modified_date GLOB '[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]' IS 1),
"available_date" TEXT
CHECK(available_date IS NULL OR
available_date GLOB '[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]' IS 1)
-- "data_source" TEXT
-- CHECK(data_source IS NULL OR data_source IN ('GDSN', 'LI')),
"data_source" TEXT,
"package_weight" TEXT,
-- "modified_date" TEXT
-- CHECK(modified_date IS NULL OR
-- modified_date GLOB '[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]' IS 1),
-- "available_date" TEXT
-- CHECK(available_date IS NULL OR
-- available_date GLOB '[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]' IS 1),
"modified_date" TEXT,
"available_date" TEXT,
"market_country" TEXT,
-- "discontinued_date" TEXT
-- CHECK(discontinued_date IS NULL OR
-- discontinued_date GLOB '[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]' IS 1)
"discontinued_date" TEXT
);

CREATE INDEX idx_branded_food_gtin_upc ON branded_food (gtin_upc);
CREATE INDEX idx_branded_food_branded_food_category ON branded_food (branded_food_category);

CREATE TABLE food (
fdc_id INT NOT NULL PRIMARY KEY,
data_type TEXT,
description TEXT,
food_category_id INT REFERENCES food_category(id),
publication_date TEXT
"fdc_id" INT NOT NULL PRIMARY KEY,
"data_type" TEXT,
"description" TEXT,
-- "food_category_id" INT REFERENCES food_category(id),
"food_category_id" INT,
"publication_date" TEXT
CHECK(publication_date IS NULL OR
publication_date GLOB '[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]' IS 1)
-- "scientific_name" TEXT
-- "food_key" TEXT
);

CREATE INDEX idx_food_data_type ON food (data_type);
Expand All @@ -159,7 +179,8 @@ def sqlite3_schema(cursor: sqlite3.Cursor) -> None:
);

CREATE TABLE food_calorie_conversion_factor (
"food_nutrient_conversion_factor_id" INT NOT NULL PRIMARY KEY REFERENCES food_nutrient_conversion_factor(id),
-- "food_nutrient_conversion_factor_id" INT NOT NULL PRIMARY KEY REFERENCES food_nutrient_conversion_factor(id),
"food_nutrient_conversion_factor_id" INT NOT NULL PRIMARY KEY,
"protein_value" REAL,
"fat_value" REAL,
"carbohydrate_value" REAL
Expand Down Expand Up @@ -192,15 +213,19 @@ def sqlite3_schema(cursor: sqlite3.Cursor) -> None:

CREATE TABLE food_nutrient (
"id" INT NOT NULL PRIMARY KEY,
"fdc_id" INT REFERENCES food(fdc_id),
"nutrient_id" INT REFERENCES nutrient(id),
-- "fdc_id" INT REFERENCES food(fdc_id),
"fdc_id" INT,
-- "nutrient_id" INT REFERENCES nutrient(id),
"nutrient_id" INT,
"amount" REAL,
"data_points" INT,
"derivation_id" INT REFERENCES food_nutrient_derivation(id),
-- "derivation_id" INT REFERENCES food_nutrient_derivation(id),
"derivation_id" INT,
-- XXX Missing standard_error from Field Descriptions
"min" REAL,
"max" REAL,
"median" REAL,
"loq" REAL,
"footnote" TEXT,
"min_year_acquired" TEXT
CHECK(min_year_acquired IS NULL OR
Expand Down Expand Up @@ -257,6 +282,14 @@ def sqlite3_schema(cursor: sqlite3.Cursor) -> None:
"value" REAL
);

-- CREATE TABLE food_update_log_entry (
-- "fdc_id" INT REFERENCES food(fdc_id),
-- "description" TEXT,
-- "publication_date" TEXT
-- CHECK(publication_date IS NULL OR
-- publication_date GLOB '[0-9][0-9][0-9][0-9]' IS 1)
-- );

CREATE TABLE foundation_food (
"fdc_id" INT NOT NULL PRIMARY KEY REFERENCES food(fdc_id),
"NDB_number" INT UNIQUE,
Expand Down Expand Up @@ -329,6 +362,7 @@ def sqlite3_schema(cursor: sqlite3.Cursor) -> None:
CREATE TABLE measure_unit (
"id" INT NOT NULL PRIMARY KEY,
"name" TEXT UNIQUE
-- "abbreviation" TEXT
);

CREATE TABLE nutrient (
Expand Down Expand Up @@ -368,8 +402,10 @@ def sqlite3_schema(cursor: sqlite3.Cursor) -> None:
);

CREATE TABLE sub_sample_food (
"fdc_id" INT NOT NULL PRIMARY KEY REFERENCES food(fdc_id),
"fdc_id_of_sample_food" INT REFERENCES food(fdc_id)
-- "fdc_id" INT NOT NULL PRIMARY KEY REFERENCES food(fdc_id),
"fdc_id" INT NOT NULL PRIMARY KEY,
-- "fdc_id_of_sample_food" INT REFERENCES food(fdc_id)
"fdc_id_of_sample_food" INT
);

CREATE INDEX idx_sub_sample_food_fdc_id_of_sample_food ON sub_sample_food (fdc_id_of_sample_food);
Expand Down Expand Up @@ -410,6 +446,9 @@ def process(directory: str, database: str, force: bool = False, batch: int = 100

# Import in this order so it will not cause problems with foreign key
# constraints.
# Note that in the October 2021 dataset, you need to rename the following files:
# acquisition_samples.csv -> acquisition_sample.csv
# agricultural_samples.csv -> agricultural_acquisition.csv
ordered = [
'food_category.csv',
'food.csv',
Expand Down Expand Up @@ -443,6 +482,16 @@ def process(directory: str, database: str, force: bool = False, batch: int = 100
'food_nutrient.csv',
'sub_sample_result.csv',
'all_downloaded_table_record_counts.csv',
'fndds_derivation.csv',
'fndds_ingredient_nutrient_value.csv',
'food_update_log_entry.csv',
]

ignore = [
'all_downloaded_table_record_counts.csv',
'fndds_derivation.csv',
'fndds_ingredient_nutrient_value.csv',
'food_update_log_entry.csv',
]

# Make sure we accounted for everything
Expand All @@ -469,7 +518,7 @@ def process(directory: str, database: str, force: bool = False, batch: int = 100
sqlite3_schema(cursor)

for fname in ordered:
if fname == "all_downloaded_table_record_counts.csv":
if fname in ignore:
print(f"Skipping {fname}")
else:
print(f"Importing {fname}")
Expand Down Expand Up @@ -514,9 +563,12 @@ def main() -> None:
help='Whether to clobber output file')
parser.add_argument('-v', '--verbose', action='store_true', default=False,
help='Verbose output')
parser.add_argument('-c', '--check', action='store_true', default=False,
help="Only check the database, don't create it")
args = parser.parse_args()

process(args.directory, args.output, args.force, args.batch, args.verbose)
if not args.check:
process(args.directory, args.output, args.force, args.batch, args.verbose)
check(args.directory, args.output, args.verbose)


Expand Down
71 changes: 71 additions & 0 deletions queries.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
-- Nutrition summary query.
-- For each food, reports its data_type and description, the energy and
-- macronutrient amounts pulled from food_nutrient, and a serving
-- description / size / unit chosen according to the food's data_type.
-- Rows whose kcal_per_100g is NULL (no matching energy/kcal nutrient row, or
-- a NULL amount) are dropped by the final WHERE clause.
-- NOTE(review): the *_per_100g aliases presume food_nutrient.amount is
-- expressed per 100 g -- confirm against the dataset's field descriptions.
-- NOTE(review): column aliases below are written as single-quoted strings;
-- SQLite accepts that for compatibility, but bare or double-quoted
-- identifiers are the standard form.
SELECT
f.data_type,
f.description,
kcal_per_100g,
protein_per_100g,
fiber_per_100g,
carbs_per_100g,
fats_per_100g,
-- Serving description, by data_type prefix:
--   'branded%' -> branded_food.household_serving_fulltext
--   'survey%'  -> food_portion.portion_description
--   'founda%'  -> food_portion.amount followed by the measure unit name
--   otherwise  -> food_portion.amount followed by food_portion.modifier
-- The || concatenations yield NULL when either operand is NULL.
(CASE WHEN f.data_type LIKE 'branded%' THEN bf.household_serving_fulltext ELSE
(CASE WHEN f.data_type LIKE 'survey%' THEN fp.portion_description ELSE
(CASE WHEN f.data_type LIKE 'founda%' THEN fp.amount || ' ' || mu.name
ELSE fp.amount || ' ' || fp.modifier END) END) END) as 'serving',
-- Branded foods take size and unit from branded_food; every other data_type
-- uses food_portion.gram_weight with a fixed unit of 'g'.
CASE WHEN f.data_type LIKE 'branded%' THEN bf.serving_size ELSE fp.gram_weight END as 'serving_size',
CASE WHEN f.data_type LIKE 'branded%' THEN bf.serving_size_unit ELSE 'g' END as 'serving_size_unit'

FROM
-- Everything is LEFT JOINed onto food, so a food is kept even when it has no
-- portion, measure unit, branded_food row or a particular nutrient.
((((((((food f LEFT JOIN food_portion fp ON f.fdc_id = fp.fdc_id
-- The correlated subquery below keeps a single food_portion row per food when
-- several exist (they differ in serving size). There is no ORDER BY, so which
-- portion is kept is whichever row SQLite happens to return first.
AND fp.id = (SELECT id FROM food_portion WHERE f.fdc_id = food_portion.fdc_id LIMIT 1))
LEFT JOIN measure_unit mu ON fp.measure_unit_id = mu.id)
LEFT JOIN branded_food bf ON f.fdc_id = bf.fdc_id)
-- One derived table per nutrient (t1..t5), each joined back on fdc_id.
-- LIKE without a wildcard ('energy', 'kcal', 'G') acts as a case-insensitive
-- equality test under SQLite's default LIKE settings.
-- NOTE(review): the prefix patterns ('Protein%', 'Fiber%', 'Carbohydrate%',
-- 'Total lipid%') may match more than one nutrient name; if they do, a food
-- will appear on several output rows -- verify against the nutrient table.
LEFT JOIN ( SELECT fdc_id, amount AS 'kcal_per_100g' FROM food_nutrient LEFT JOIN nutrient ON food_nutrient.nutrient_id = nutrient.id WHERE name LIKE 'energy' AND unit_name LIKE 'kcal') t1 ON t1.fdc_id = f.fdc_id)
LEFT JOIN ( SELECT fdc_id, amount AS 'protein_per_100g' FROM food_nutrient LEFT JOIN nutrient ON food_nutrient.nutrient_id = nutrient.id WHERE name LIKE 'Protein%' AND unit_name LIKE 'G') t2 ON t2.fdc_id = f.fdc_id)
LEFT JOIN ( SELECT fdc_id, amount AS 'fiber_per_100g' FROM food_nutrient LEFT JOIN nutrient ON food_nutrient.nutrient_id = nutrient.id WHERE name LIKE 'Fiber%' AND unit_name LIKE 'G') t3 ON t3.fdc_id = f.fdc_id)
LEFT JOIN ( SELECT fdc_id, amount AS 'carbs_per_100g' FROM food_nutrient LEFT JOIN nutrient ON food_nutrient.nutrient_id = nutrient.id WHERE name LIKE 'Carbohydrate%' AND unit_name LIKE 'G') t4 ON t4.fdc_id = f.fdc_id)
LEFT JOIN ( SELECT fdc_id, amount AS 'fats_per_100g' FROM food_nutrient LEFT JOIN nutrient ON food_nutrient.nutrient_id = nutrient.id WHERE name LIKE 'Total lipid%' AND unit_name LIKE 'G') t5 ON t5.fdc_id = f.fdc_id)

-- Keep only foods for which an energy value in kcal was found.
WHERE kcal_per_100g IS NOT NULL;

/*
SELECT
f.data_type,
f.description,
fn.amount AS 'kcal_per_100g',
fn.min,
fn.max,
n.name,
n.unit_name,
fp.amount as 'portion_amount',
mu.name as 'portion_unit',
fp.portion_description,
fp.modifier as 'portion_modifier',
fp.gram_weight as 'portion_gram_weight',
bf.brand_owner,
bf.serving_size as 'branded_serving_size',
bf.serving_size_unit as 'branded_serving_unit',
bf.household_serving_fulltext as 'branded_serving_household_unit',
bf.ingredients as 'branded_ingredients'

FROM
(((((food f LEFT JOIN food_nutrient fn ON f.fdc_id = fn.fdc_id)
LEFT JOIN nutrient n ON fn.nutrient_id = n.id)
LEFT JOIN food_portion fp on f.fdc_id = fp.fdc_id)
LEFT JOIN measure_unit mu on fp.measure_unit_id = mu.id)
LEFT JOIN branded_food bf on f.fdc_id = bf.fdc_id)
WHERE
--food.description LIKE '%apple%'
--AND
n.name LIKE 'energy' AND n.unit_name LIKE 'kcal'
*/



--SELECT food.description, food_attribute.name, food_attribute.value from food inner join food_attribute on food.fdc_id = food_attribute.fdc_id

--SELECT * from food_portion

--SELECT DISTINCT food.food_category_id from food;
--SELECT food.food_category_id, food_category.description from food inner JOIN food_category ON food.food_category_id = food_category.id;

-- SELECT food.*, food_category.description FROM food LEFT JOIN food_category ON food.food_category_id = food_category.id