diff --git a/README.md b/README.md index 49ccf5a..4ad0b1e 100644 --- a/README.md +++ b/README.md @@ -12,14 +12,12 @@ fast even for huge files. ![](images/visual-demo.png) - ## Installation ```bash pip install fastapi-csv ``` - ## How to use it ### 1. From the command line @@ -29,16 +27,15 @@ an API for it, run one of: ```bash # from file -fastapi-csv people.csv +fastapi-csv dictionary.csv --delimiter "\t" # directly from URL fastapi-csv https://raw.githubusercontent.com/jrieke/fastapi-csv/main/people.csv # With options : specify the delimiter character in the CSV, the host and port for the server -fastapi-csv people.csv --delimiter "|" --host "127.0.0.1" --port 1234 +fastapi-csv dictionary.csv --delimiter "|" --host "127.0.0.1" --port 1234 ``` - Either command should start a fastapi instance, which has auto-generated endpoints and query parameters based on the CSV file. Here, the API has an endpoint `/people` (same name as the file), which can be queried using the CSV's column names, e.g. you can @@ -57,7 +54,6 @@ types, e.g. Check out the API docs for more information and an interactive demo, they should be at http://127.0.0.1:8000/docs - ### 2. From Python Create a file `my_file.py`: @@ -85,11 +81,9 @@ all the stuff you can do with a normal fastapi instance, e.g. add a new endpoint def hello(self): return {"Hello:", "World"} ``` - In the future, you will also be able to easily modify existing endpoints that were generated from the CSV file. - **Updating the data** If your CSV file changes, you can update the API data with: diff --git a/dictionary.csv b/dictionary.csv new file mode 100644 index 0000000..01a2c21 --- /dev/null +++ b/dictionary.csv @@ -0,0 +1,6 @@ +TL,TLMorph,SL,SLMorph,POS,Domain +Auto,m,car,,noun,general +Mutter,f,mother,,noun,general +Mutter,f,screw,,noun,technical +Leben,n,life,,noun,general +leben,trv;intrv,to live,,verb,general diff --git a/fastapi_csv/applications.py b/fastapi_csv/applications.py index 8cdac65..42ade58 100644 --- a/fastapi_csv/applications.py +++ b/fastapi_csv/applications.py @@ -1,19 +1,19 @@ -""" -Contains the main `FastAPI_CSV` class, which wraps `FastAPI`. -""" - -import os -from typing import Union, Type -from pathlib import Path +# Contains the main `FastAPI_CSV` class, which wraps `FastAPI`. import inspect import logging - -from fastapi import FastAPI -import fastapi -import pandas as pd +import re import sqlite3 +from pathlib import Path +from typing import Union, Type +import fastapi import numpy as np +import pandas as pd import pydantic +from fastapi import FastAPI + +def is_date_string(string: str) -> bool: + """Check if a string is a date string.""" + return re.match(r"^([0-9]{4})-(?:[0-9]{2})-([0-9]{2})$", string) is not None def create_query_param(name: str, type_: Type, default) -> pydantic.fields.ModelField: @@ -29,7 +29,6 @@ def create_query_param(name: str, type_: Type, default) -> pydantic.fields.Model ) return field - def dtype_to_type(dtype) -> Type: """Convert numpy/pandas dtype to normal Python type.""" if dtype == np.object: @@ -37,32 +36,25 @@ def dtype_to_type(dtype) -> Type: else: return type(np.zeros(1, dtype).item()) - class FastAPI_CSV(FastAPI): # TODO: Implement a way to modify auto-generated endpoints, e.g. by - # # @app.modify("/people") # def modify_people(results: List, new_query_param: str = "foo"): - # # # `results` are the dicts/json that are normally returned by the endpoint. # # Modify them as you like. # results.append({"Hello": "World"}) - # # # Any additional function args (like `new_query_param`) are added as p # # arameters to the endpoint, just like in normal fastapi. # results.append({"Hello": new_query_param}) - # # # You can also do a manual query on the database that's created from the CSV. # rows = app.query_database(f"SELECT * FROM people WHERE first_name={new_query_param}") # results.append(rows) - # # # Return modified results so they get passed to the user. # return results def __init__(self, csv_path: Union[str, Path], delimiter: Union[str, str]) -> None: """ Initializes a FastAPI instance that serves data from a CSV file. - Args: csv_path (Union[str, Path]): The path to the CSV file, can also be a URL """ @@ -70,21 +62,25 @@ def __init__(self, csv_path: Union[str, Path], delimiter: Union[str, str]) -> No # Read CSV file to pandas dataframe and create sqlite3 database from it. self.csv_path = csv_path - self.delimiter = delimiter - self.table_name = Path(self.csv_path).stem + self.table_name = Path(self.csv_path).stem.replace('-', '_') self.con = None + self.delimiter = delimiter df = self.update_database() # Add an endpoint for the CSV file with one query parameter for each column. - # We hack into fastapi a bit here to inject the query parameters at runtime - # based on the column names/types. - + # We hack into fastapi a bit here to inject the query parameters at runtime based on the column names/types. # First, define a generic endpoint method, which queries the database. def generic_get(**kwargs): + selected_cols = [] where_clauses = [] + use_distinct = False for name, val in kwargs.items(): if val is not None: - if name.endswith("_greaterThan"): + if name == "use_distinct": + use_distinct = True + elif name.endswith("_selected"): + selected_cols.append(name[:-9]) + elif name.endswith("_greaterThan"): where_clauses.append(f"{name[:-12]}>{val}") elif name.endswith("_greaterThanEqual"): where_clauses.append(f"{name[:-17]}>={val}") @@ -94,6 +90,14 @@ def generic_get(**kwargs): where_clauses.append(f"{name[:-14]}<={val}") elif name.endswith("_contains"): where_clauses.append(f"instr({name[:-9]}, '{val}') > 0") + elif name.endswith("_like"): + where_clauses.append(f"{name[:-5]} LIKE '{val}'") + elif name.endswith("_regex"): + where_clauses.append(f"{name[:-6]} REGEXP '{val}'") + elif name.endswith("_isBefore"): + where_clauses.append(f"DATE({name[:-9]}) < DATE('{val}')") + elif name.endswith("_isAfter"): + where_clauses.append(f"DATE({name[:-8]}) > DATE('{val}')") else: if isinstance(val, str): val = f"'{val}'" @@ -103,7 +107,8 @@ def generic_get(**kwargs): else: where = "" - sql_query = f"SELECT * FROM {self.table_name} {where}" + selection = ','.join(selected_cols) + sql_query = f"SELECT {'DISTINCT' if use_distinct else ''} {selection if len(selected_cols) else '*'} FROM {self.table_name} {where}" dicts = self.query_database(sql_query) return dicts @@ -114,10 +119,15 @@ def generic_get(**kwargs): # Remove all auto-generated query parameters (=one for `kwargs`). self._clear_query_params(route_path) + # Add use_distinct query param + self._add_query_param(route_path, 'use_distinct', bool) + # Add new query parameters based on column names and data types. for col, dtype in zip(df.columns, df.dtypes): type_ = dtype_to_type(dtype) self._add_query_param(route_path, col, type_) + # Use as a flag to select only given columns + self._add_query_param(route_path, col + "_selected", bool) if type_ in (int, float): self._add_query_param(route_path, col + "_greaterThan", type_) self._add_query_param(route_path, col + "_greaterThanEqual", type_) @@ -125,6 +135,11 @@ def generic_get(**kwargs): self._add_query_param(route_path, col + "_lessThanEqual", type_) elif type_ == str: self._add_query_param(route_path, col + "_contains", type_) + self._add_query_param(route_path, col + "_like", type_) + self._add_query_param(route_path, col + "_regex", type_) + if is_date_string(df[col].iloc[df[col].first_valid_index()]): + self._add_query_param(route_path, col + "_isBefore", type_) + self._add_query_param(route_path, col + "_isAfter", type_) def query_database(self, sql_query): """Executes a SQL query on the database and returns rows as list of dicts.""" @@ -135,8 +150,8 @@ def query_database(self, sql_query): def delete_database(self): """ - Deletes the database with all data read from the CSV. - + Deletes the database with all data read from the CSV. + The CSV file is not deleted of course. The API endpoints are also not affected, so you can use `update_data` to read in new data. """ @@ -150,9 +165,9 @@ def delete_database(self): def update_database(self): """ Updates the database with the current data from the CSV file. - + Note that this only affects the database, not the endpoints. If the column names - and/or data types in the CSV change (and you want that to update in the + and/or data types in the CSV change (and you want that to update in the endpoints as well), you need to create a new FastAPI_CSV object. """ self.delete_database() @@ -165,7 +180,7 @@ def update_database(self): # Download excel file from Google Sheets, read it with pandas and write to # database. - df = pd.read_csv(self.csv_path, delimiter=self.delimiter) + df = pd.read_csv(self.csv_path, delimiter=self.delimiter, engine ='python') self.con = sqlite3.connect(":memory:", check_same_thread=False) df.to_sql(self.table_name, self.con) @@ -180,6 +195,13 @@ def dict_factory(cursor, row): self.con.row_factory = dict_factory logging.info("Database successfully updated") + # Implement the REGEXP operator in SQLite. + def regexp(expr, item): + if item is None: + return False + reg = re.compile(expr) + return reg.search(item) is not None + self.con.create_function("REGEXP", 2, regexp) return df def _find_route(self, route_path): diff --git a/images/visual-demo.png b/images/visual-demo.png deleted file mode 100644 index 176229a..0000000 Binary files a/images/visual-demo.png and /dev/null differ diff --git a/people.csv b/people.csv deleted file mode 100644 index 2aaa57c..0000000 --- a/people.csv +++ /dev/null @@ -1,6 +0,0 @@ -first_name,last_name,age -Rachel,Booker,10 -Laura,Grey,24 -Mary,Jenkins,24 -Craig,Johnson,48 -Jamie,Johnson,14 diff --git a/requirements.txt b/requirements.txt index 45c22c7..a60d27c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ -numpy==1.19.4 -pydantic==1.6.1 -pandas==1.1.3 -uvicorn==0.13.3 -fastapi==0.63.0 -typer==0.3.2 +numpy>=1.19.4 +pydantic>=1.6.1 +pandas>=1.1.3 +uvicorn>=0.13.3 +fastapi>=0.63.0 +typer>=0.3.2