CSV to SQL

17th Aug 2022 research scripts

Python script I wrote in 2022 to convert csv files into DDL SQL instructions, leveraging Pandas and NumPy.

It writes resulting DDL into a sql file with the same name of the csv and in the same directory.

If the primary key is not provided inside the CSV file, the script can generate ID by itself by auto increment.

Sample usage

COL1;COL2;COL3;COLBOOL;AMOUNT
Michele;Salvucci;Pavia;true;13,1
Mario;Monti;Milano;false;65,2
Giuseppe;Bastiano;;true;44

python csv_to_sql.py --table=USERS --idname=ID --idstring="next_val(sequence)" ./test.csv

Output:

INSERT INTO USERS (ID, COL1, COL2, COL3, COLBOOL, AMOUNT) VALUES (next_val(sequence), 'Michele', 'Salvucci', 'Pavia', true, 13.1);
INSERT INTO USERS (ID, COL1, COL2, COL3, COLBOOL, AMOUNT) VALUES (next_val(sequence), 'Mario', 'Monti', 'Milano', false, 65.2);
INSERT INTO USERS (ID, COL1, COL2, COL3, COLBOOL, AMOUNT) VALUES (next_val(sequence), 'Giuseppe', 'Bastiano', NULL, true, 44.0);

python csv_to_sql.py --table=USERS --idname=ID --idauto=true ./test.csv

Output:

INSERT INTO USERS (ID, COL1, COL2, COL3, COLBOOL, AMOUNT) VALUES (0, 'Michele', 'Salvucci', 'Pavia', true, 13.1);
INSERT INTO USERS (ID, COL1, COL2, COL3, COLBOOL, AMOUNT) VALUES (1, 'Mario', 'Monti', 'Milano', false, 65.2);
INSERT INTO USERS (ID, COL1, COL2, COL3, COLBOOL, AMOUNT) VALUES (2, 'Giuseppe', 'Bastiano', NULL, true, 44.0);

Script

#!/usr/bin/env python3.7

"""
Exec example:
python csv_to_sql.py --table=USERS ./test.csv
"""

import pandas as pd
import numpy as np
import argparse
import pprint
import csv
from typing import List

parser = argparse.ArgumentParser(description='Convert CSV to SQL insert')
parser.add_argument('file', type=argparse.FileType('r'))
parser.add_argument('-t', '--table', type=str, required=True, help='The name of the SQL table.')
parser.add_argument('-id', '--idname', type=str, default='ID', help='The SQL table ID column name'+ 
                    'If provided the script will handle ID auto generation following the command below.')
parser.add_argument('-ida', '--idauto', type=bool, default=False, help='The ID will be generate with a sequence from 1 to n.')
parser.add_argument('-ids', '--idstring', 
                    type=str, 
                    default='(SELECT max({0}) + 1 FROM {1})', 
                    help='The SQL string to auto increment the ID value. Provide eventual column and table name' +
                    'by using {0} and {1} placeholders')
parser.add_argument('-s', '--separator', type=str, default=';', help='The CSV separator.')

args = parser.parse_args()

ID_PLACEHOLDER = "@ID@"

def map_value_type(value) -> str:
    if pd.isna(value):
        return 'NULL'
    elif isinstance(value, str):
        if value.startswith(ID_PLACEHOLDER):
            return value.replace(ID_PLACEHOLDER, '')
        else:
            return "'{}'".format(value)
    elif isinstance(value, bool):
        return str(value).lower()
    elif isinstance(value, int) or isinstance(value, float):
        return str(value)

def generate_sql_line(columns: List[str], values: np.array) -> str:    
    mapper_func = np.vectorize(lambda v: map_value_type(v))
    values = mapper_func(values)

    values_str = ", ".join(values)
    return 'INSERT INTO {0} ({1}) VALUES ({2});'.format(args.table, ", ".join(columns), values_str)

def main():
    print("CSV -> SQL")
    print("FILE: ", args.file.name)

    input_df = pd.read_csv(args.file, sep=args.separator, thousands=r'.', decimal=",")

    if args.idname:
        if args.idauto:
            input_df.insert(loc=0, column=args.idname, value=np.arange(input_df.shape[0]))
        else:
            input_df.insert(loc=0, column=args.idname, 
                value='{0}{1}'.format(ID_PLACEHOLDER, args.idstring)
                .format(args.idname, args.table)
            )

    pprint.pprint(input_df)
    np_values = input_df.values

    output_np = np.empty(0, dtype='S')

    for i in range(np_values.shape[0]):
        insert_statement = generate_sql_line(input_df.columns, np_values[i])
        output_np = np.append(output_np, insert_statement)

    pprint.pprint(output_np)
    df = pd.DataFrame(output_np)
    df.to_csv(args.file.name.replace(".csv", ".sql"), sep="\t", 
              quoting=csv.QUOTE_NONE, 
              escapechar="", 
              header=False, index=False
              )

if __name__ == "__main__":
    main()

Previous Post Next Post