betagouv · letconex · Mar 15, 2023 · Mar 15, 2023 · Mar 15, 2023 · Mar 16, 2023
diff --git a/README.md b/README.md
@@ -12,14 +12,12 @@ fast even for huge files.
 
 ![](images/visual-demo.png)
 
-
 ## Installation
 
 ```bash
 pip install fastapi-csv
 ```
 
-
 ## How to use it
 
 ### 1. From the command line
@@ -29,16 +27,15 @@ an API for it, run one of:
 
 ```bash
 # from file
-fastapi-csv people.csv
+fastapi-csv dictionary.csv --delimiter "\t"
 
 # directly from URL
 fastapi-csv https://raw.githubusercontent.com/jrieke/fastapi-csv/main/people.csv
 
 # With options : specify the delimiter character in the CSV, the host and port for the server
-fastapi-csv people.csv --delimiter "|" --host "127.0.0.1" --port 1234
+fastapi-csv dictionary.csv --delimiter "|" --host "127.0.0.1" --port 1234
 
 ```
-
 Either command should start a fastapi instance, which has auto-generated endpoints and
 query parameters based on the CSV file. Here, the API has an endpoint `/people`
 (same name as the file), which can be queried using the CSV's column names, e.g. you can
@@ -57,7 +54,6 @@ types, e.g.
 Check out the API docs for more information and an interactive demo, they should be at
 http://127.0.0.1:8000/docs
 
-
 ### 2. From Python
 
 Create a file `my_file.py`:
@@ -85,11 +81,9 @@ all the stuff you can do with a normal fastapi instance, e.g. add a new endpoint
 def hello(self):
     return {"Hello:", "World"}
 ```
-
 In the future, you will also be able to easily modify existing endpoints that were
 generated from the CSV file.
 
-
 **Updating the data**
 
 If your CSV file changes, you can update the API data with:

diff --git a/dictionary.csv b/dictionary.csv
@@ -0,0 +1,6 @@
+TL,TLMorph,SL,SLMorph,POS,Domain
+Auto,m,car,,noun,general
+Mutter,f,mother,,noun,general
+Mutter,f,screw,,noun,technical
+Leben,n,life,,noun,general
+leben,trv;intrv,to live,,verb,general
diff --git a/fastapi_csv/applications.py b/fastapi_csv/applications.py
@@ -1,19 +1,19 @@
-"""
-Contains the main `FastAPI_CSV` class, which wraps `FastAPI`.
-"""
-
-import os
-from typing import Union, Type
-from pathlib import Path
+# Contains the main `FastAPI_CSV` class, which wraps `FastAPI`.
 import inspect
 import logging
-
-from fastapi import FastAPI
-import fastapi
-import pandas as pd
+import re
 import sqlite3
+from pathlib import Path
+from typing import Union, Type
+import fastapi
 import numpy as np
+import pandas as pd
 import pydantic
+from fastapi import FastAPI
+
+def is_date_string(string: str) -> bool:
+    """Check if a string is a date string."""
+    return re.match(r"^([0-9]{4})-(?:[0-9]{2})-([0-9]{2})$", string) is not None
 
 
 def create_query_param(name: str, type_: Type, default) -> pydantic.fields.ModelField:
@@ -29,62 +29,58 @@ def create_query_param(name: str, type_: Type, default) -> pydantic.fields.Model
     )
     return field
 
-
 def dtype_to_type(dtype) -> Type:
     """Convert numpy/pandas dtype to normal Python type."""
     if dtype == np.object:
         return str
     else:
         return type(np.zeros(1, dtype).item())
 
-
 class FastAPI_CSV(FastAPI):
     # TODO: Implement a way to modify auto-generated endpoints, e.g. by
-    #
     # @app.modify("/people")
     # def modify_people(results: List, new_query_param: str = "foo"):
-    #
     #     # `results` are the dicts/json that are normally returned by the endpoint.
     #     # Modify them as you like.
     #     results.append({"Hello": "World"})
-    #
     #     # Any additional function args (like `new_query_param`) are added as p
     #     # arameters to the endpoint, just like in normal fastapi.
     #     results.append({"Hello": new_query_param})
-    #
     #     # You can also do a manual query on the database that's created from the CSV.
     #     rows = app.query_database(f"SELECT * FROM people WHERE first_name={new_query_param}")
     #     results.append(rows)
-    #
     #     # Return modified results so they get passed to the user.
     #     return results
 
     def __init__(self, csv_path: Union[str, Path], delimiter: Union[str, str]) -> None:
         """
         Initializes a FastAPI instance that serves data from a CSV file.
-
         Args:
             csv_path (Union[str, Path]): The path to the CSV file, can also be a URL
         """
         super().__init__()
 
         # Read CSV file to pandas dataframe and create sqlite3 database from it.
         self.csv_path = csv_path
-        self.delimiter = delimiter
-        self.table_name = Path(self.csv_path).stem
+        self.table_name = Path(self.csv_path).stem.replace('-', '_')
         self.con = None
+        self.delimiter = delimiter
         df = self.update_database()
 
         # Add an endpoint for the CSV file with one query parameter for each column.
-        # We hack into fastapi a bit here to inject the query parameters at runtime
-        # based on the column names/types.
-
+        # We hack into fastapi a bit here to inject the query parameters at runtime based on the column names/types.
         # First, define a generic endpoint method, which queries the database.
         def generic_get(**kwargs):
+            selected_cols = []
             where_clauses = []
+            use_distinct = False
             for name, val in kwargs.items():
                 if val is not None:
-                    if name.endswith("_greaterThan"):
+                    if name == "use_distinct":
+                        use_distinct = True
+                    elif name.endswith("_selected"):
+                        selected_cols.append(name[:-9])
+                    elif name.endswith("_greaterThan"):
                         where_clauses.append(f"{name[:-12]}>{val}")
                     elif name.endswith("_greaterThanEqual"):
                         where_clauses.append(f"{name[:-17]}>={val}")
@@ -94,6 +90,14 @@ def generic_get(**kwargs):
                         where_clauses.append(f"{name[:-14]}<={val}")
                     elif name.endswith("_contains"):
                         where_clauses.append(f"instr({name[:-9]}, '{val}') > 0")
+                    elif name.endswith("_like"):
+                        where_clauses.append(f"{name[:-5]} LIKE '{val}'")
+                    elif name.endswith("_regex"):
+                        where_clauses.append(f"{name[:-6]} REGEXP '{val}'")
+                    elif name.endswith("_isBefore"):
+                        where_clauses.append(f"DATE({name[:-9]}) < DATE('{val}')")
+                    elif name.endswith("_isAfter"):
+                        where_clauses.append(f"DATE({name[:-8]}) > DATE('{val}')")
                     else:
                         if isinstance(val, str):
                             val = f"'{val}'"
@@ -103,7 +107,8 @@ def generic_get(**kwargs):
             else:
                 where = ""
 
-            sql_query = f"SELECT * FROM {self.table_name} {where}"
+            selection = ','.join(selected_cols)
+            sql_query = f"SELECT {'DISTINCT' if use_distinct else ''} {selection if len(selected_cols) else '*'} FROM {self.table_name} {where}"
             dicts = self.query_database(sql_query)
             return dicts
 
@@ -114,17 +119,27 @@ def generic_get(**kwargs):
         # Remove all auto-generated query parameters (=one for `kwargs`).
         self._clear_query_params(route_path)
 
+        # Add use_distinct query param
+        self._add_query_param(route_path, 'use_distinct', bool)
+
         # Add new query parameters based on column names and data types.
         for col, dtype in zip(df.columns, df.dtypes):
             type_ = dtype_to_type(dtype)
             self._add_query_param(route_path, col, type_)
+            # Use as a flag to select only given columns
+            self._add_query_param(route_path, col + "_selected", bool)
             if type_ in (int, float):
                 self._add_query_param(route_path, col + "_greaterThan", type_)
                 self._add_query_param(route_path, col + "_greaterThanEqual", type_)
                 self._add_query_param(route_path, col + "_lessThan", type_)
                 self._add_query_param(route_path, col + "_lessThanEqual", type_)
             elif type_ == str:
                 self._add_query_param(route_path, col + "_contains", type_)
+                self._add_query_param(route_path, col + "_like", type_)
+                self._add_query_param(route_path, col + "_regex", type_)
+                if is_date_string(df[col].iloc[df[col].first_valid_index()]):
+                    self._add_query_param(route_path, col + "_isBefore", type_)
+                    self._add_query_param(route_path, col + "_isAfter", type_)
 
     def query_database(self, sql_query):
         """Executes a SQL query on the database and returns rows as list of dicts."""
@@ -135,8 +150,8 @@ def query_database(self, sql_query):
 
     def delete_database(self):
         """
-        Deletes the database with all data read from the CSV.
-
+        Deletes the database with all data read from the CSV. 
+            
         The CSV file is not deleted of course. The API endpoints are also not affected,
         so you can use `update_data` to read in new data.
         """
@@ -150,9 +165,9 @@ def delete_database(self):
     def update_database(self):
         """
         Updates the database with the current data from the CSV file.
-
+        
         Note that this only affects the database, not the endpoints. If the column names
-        and/or data types in the CSV change (and you want that to update in the
+        and/or data types in the CSV change (and you want that to update in the 
         endpoints as well), you need to create a new FastAPI_CSV object.
         """
         self.delete_database()
@@ -165,7 +180,7 @@ def update_database(self):
 
         # Download excel file from Google Sheets, read it with pandas and write to
         # database.
-        df = pd.read_csv(self.csv_path, delimiter=self.delimiter)
+        df = pd.read_csv(self.csv_path, delimiter=self.delimiter, engine ='python')
         self.con = sqlite3.connect(":memory:", check_same_thread=False)
         df.to_sql(self.table_name, self.con)
 
@@ -180,6 +195,13 @@ def dict_factory(cursor, row):
         self.con.row_factory = dict_factory
         logging.info("Database successfully updated")
 
+        # Implement the REGEXP operator in SQLite.
+        def regexp(expr, item):
+            if item is None:
+                return False
+            reg = re.compile(expr)
+            return reg.search(item) is not None
+        self.con.create_function("REGEXP", 2, regexp)
         return df
 
     def _find_route(self, route_path):

diff --git a/images/visual-demo.png b/images/visual-demo.png
diff --git a/people.csv b/people.csv
diff --git a/requirements.txt b/requirements.txt
@@ -1,6 +1,6 @@
-numpy==1.19.4
-pydantic==1.6.1
-pandas==1.1.3
-uvicorn==0.13.3
-fastapi==0.63.0
-typer==0.3.2
+numpy>=1.19.4
+pydantic>=1.6.1
+pandas>=1.1.3
+uvicorn>=0.13.3
+fastapi>=0.63.0
+typer>=0.3.2