Ingest data of World Development Indicators from The World Bank Open Data

Ingest data of World Development Indicators from The World Bank Open Data

·

14 min read

Table of contents

This article will demonstrate the data ingestion and exploration for World Development Indicator datasets like World Bank Country & World Bank CO2 emissions (kt) & World Bank GDP & World Bank Life expectancy provided by The World Bank Open Data.

The World Bank

World Bank Open Data

1. Import libraries

# Third-party and standard-library imports used throughout the article:
# requests (HTTP), pandas (tabular data), pyodbc + SQLAlchemy (SQL Server),
# seaborn/matplotlib (plots), copy/os (stdlib helpers).
import requests
import pandas as pd
import copy
import pyodbc
from sqlalchemy import create_engine, text
from sqlalchemy.engine import URL
from sqlalchemy.orm import sessionmaker
from sqlalchemy.exc import OperationalError
import seaborn as sns
import matplotlib.pyplot as plt
import os

# Set the display option to show all max column width
pd.set_option('display.max_colwidth', None)

# Set the display option to show all columns
pd.set_option('display.max_columns', None)

2. Ingest data from World Bank of (World Development Indicators)

2.1 World Bank Country data

  • Requesting Country Data to list all countries: The following information will appear, when available, in the response when using this country query through the World Bank API:

    • 3 letter ISO 3166-1 alpha-3 code

    • 2 letter ISO 3166-1 alpha-2 code

    • Name

    • Region: ID, name and World Bank 2 letter code

    • Income Level: ID, name and World Bank 2 letter code

    • Lending Type: ID, name and World Bank 2 letter code

    • Capital City

    • Longitude

    • Latitude

2.1.1 Url Connection setup

# Endpoint that lists every country known to the World Bank API.
# per_page is oversized so the whole list arrives in a single JSON page.
wbctry_code_url = (
    "http://api.worldbank.org/v2/country"
    "?format=json&per_page=50000"
)

2.1.2 using Get request to ingest from url:

Run the below cell only once

# Make the HTTP request; an explicit timeout prevents the call from
# hanging indefinitely if the World Bank API is slow or unreachable.
response = requests.get(wbctry_code_url, timeout=30)

# Check the status of the request before trusting the body.
if response.status_code == 200:
    raw_data = response.json()
    print("Request was successful.", response.headers['Content-Type'])
else:
    print(f"Request failed with status code: {response.status_code}")
  • headers
response.headers
  • contents
response.content

2.1.3 Exploring json response

# REPL-style inspection: the API returns a 2-element list,
# [0] pagination metadata, [1] the list of country records.
type(raw_data), len(raw_data)
raw_data[0]
raw_data[1][0]
# Deep copy so later transformations cannot mutate raw_data.
json_wbctry = copy.deepcopy(raw_data[1])
type(json_wbctry)
len(json_wbctry)

json_wbctry[0]

2.1.4 Destructuring json response

# Flatten each country record: nested dicts such as "region" become
# "region_id"/"region_value"/... keys (sorted alphabetically), then the
# scalar top-level fields are appended unchanged.
destrc_json_wbctry = []

for ctry in json_wbctry:

    # Destructuring nested objects and sorting the keys.
    # strip() is applied only to strings, so a non-string nested value
    # (e.g. null in the API payload) can no longer raise AttributeError.
    destrc_json = {
        f"{k}_{nested_k}": nested_v.strip() if isinstance(nested_v, str) else nested_v
        for k, v in ctry.items() if isinstance(v, dict)
        for nested_k, nested_v in v.items()
    }
    srt_destrc_json = dict(sorted(destrc_json.items()))

    # Adding the main (non-nested) keys to the destructured object.
    srt_destrc_json.update({k: v for k, v in ctry.items() if not isinstance(v, dict)})
    destrc_json_wbctry.append(srt_destrc_json)
destrc_json_wbctry

2.1.5 Use pandas DataFrame to convert the JSON data:

# Tabulate the flattened country records with an explicit column order.
wbctry = pd.DataFrame(data=destrc_json_wbctry)[[
    'id',
    'iso2Code',
    'name',
    'region_id',
    'region_iso2code',
    'region_value',
    'adminregion_id',
    'adminregion_iso2code',
    'adminregion_value',
    'incomeLevel_id',
    'incomeLevel_iso2code',
    'incomeLevel_value',
    'lendingType_id',
    'lendingType_iso2code',
    'lendingType_value',
    'capitalCity',
    'longitude',
    'latitude',
]]
# Sanity check of dtypes and shape.
wbctry.dtypes, wbctry.shape

wbctry

2.1.6 Transformations

2.1.6.1 Renaming column
# Swap the two code columns' labels: after this rename the column named
# 'iso2Code' holds the 3-letter code and 'id' the 2-letter code.
# NOTE(review): the swap looks deliberate (the 3-letter code is later merged
# against countryiso3code), but confirm the naming is intended.
wbctry_filt = wbctry.rename(columns={'id': 'iso2Code', 'iso2Code': 'id'})
2.1.6.2 Filtering data
# Drop aggregate pseudo-countries (regions, income groups, world totals).
# The original chained .reset_index(drop=True, inplace=True) onto the .loc[]
# result, which mutated a temporary copy and left wbctry_filt unchanged —
# the filter was never applied.  Assign the result back instead.
wbctry_filt = wbctry_filt.loc[wbctry_filt['incomeLevel_value'] != 'Aggregates'].reset_index(drop=True)

2.1.6.3 Rearranging

# Re-select the columns in presentation order; .copy() detaches the result
# from the source frame so later assignments don't warn about views.
wbctry_filt = wbctry_filt[[
    'id',
    'iso2Code',
    'name',
    'region_id',
    'region_iso2code',
    'region_value',
    'adminregion_id',
    'adminregion_iso2code',
    'adminregion_value',
    'incomeLevel_id',
    'incomeLevel_iso2code',
    'incomeLevel_value',
    'lendingType_id',
    'lendingType_iso2code',
    'lendingType_value',
    'capitalCity',
    'longitude',
    'latitude',
]].copy()


2.2 Finding source Id

2.2.1 Url Connection setup

# Endpoint that lists all World Bank data sources in one JSON page.
wbsrc_code_url = (
    "https://api.worldbank.org/v2/sources"
    "?format=json&per_page=50000"
)

2.2.2 using Get request to ingest from url:

Run the below cell only once

# Make the HTTP request; the timeout avoids hanging on an unreachable API.
response = requests.get(wbsrc_code_url, timeout=30)

# Check the status of the request before trusting the body.
if response.status_code == 200:
    raw_data = response.json()
    print("Request was successful.", response.headers['Content-Type'])
else:
    print(f"Request failed with status code: {response.status_code}")
# REPL inspection of the raw reply.
response.headers
response.content

2.2.3 Exploring json response

# The sources payload is [pagination metadata, list of source records].
type(raw_data), len(raw_data)
raw_data[0]
# Deep copy so later edits cannot mutate raw_data.
json_wbsrc = copy.deepcopy(raw_data[1])
type(json_wbsrc)
json_wbsrc

2.2.4 Use pandas DataFrame to convert the JSON data:

# Source records are flat dicts, so no destructuring pass is needed here.
wbsrc = pd.DataFrame(data=json_wbsrc)
wbsrc.dtypes, wbsrc.shape

wbsrc

2.2.5 Get Source Id for a Database

# Locate the source id of the "World Development Indicators" database.
# .iloc[0] extracts the scalar id; the original .to_string(index=False)
# can prepend whitespace and would join multiple matches with newlines,
# which would corrupt the URLs built from this value later.
wdi_srcid = wbsrc.loc[
    wbsrc['name'].str.contains('World Development Indicators', case=False),
    'id'
].iloc[0]
wdi_srcid


2.3 World Bank CO2 emissions (kt)

2.3.1 Finding indicator Id

2.3.1.1 Url Connection setup

# Search the WDI source's indicator metadata for this keyword.
schkeywrd = 'CO2'

wbindicaco_code_url = (
    "https://api.worldbank.org/v2"
    f"/sources/{wdi_srcid}"
    f"/search/{schkeywrd}"
    "?format=json&per_page=20000"
)

2.3.1.2 using Get request to ingest from url:

Run the below cell only once

# Make the HTTP request; the timeout avoids hanging on an unreachable API.
response = requests.get(wbindicaco_code_url, timeout=30)

# Check the status of the request before trusting the body.
if response.status_code == 200:
    raw_data = response.json()
    print("Request was successful.", response.headers['Content-Type'])
else:
    print(f"Request failed with status code: {response.status_code}")
# REPL inspection of the raw reply.
response.headers
response.content

2.3.1.3 Exploring json response

# Unlike the listing endpoints, /search returns a dict keyed by 'source'.
type(raw_data), len(raw_data)
raw_data
raw_data.keys()

type(raw_data['source']), len(raw_data['source'])
# The matching indicator records live under source[0].concept[0].variable.
raw_data['source'][0]['concept'][0]['variable']

# Deep copy so later edits cannot mutate raw_data.
json_wbindicaco = copy.deepcopy(raw_data['source'][0]['concept'][0]['variable'])
type(json_wbindicaco)
json_wbindicaco[0]

2.3.1.4 Destructuring json response

# Flatten each indicator record: list-valued fields (e.g. "metatype")
# expand to "<field>_<position>_<key>" entries; scalars pass through
# unchanged and in their original order.
def _flatten_indicator(rec):
    flat = {}
    for key, val in rec.items():
        if isinstance(val, list):
            for pos, item in enumerate(val):
                for inner_key, inner_val in item.items():
                    flat[f"{key}_{pos}_{inner_key}"] = inner_val
        else:
            flat[key] = val
    return flat

destrc_json_wbindicaco = [_flatten_indicator(rec) for rec in json_wbindicaco]
destrc_json_wbindicaco

2.3.1.5 Use pandas DataFrame to convert the JSON data:

# Tabulate the flattened indicator metadata.
wbindicaco = pd.DataFrame(data=destrc_json_wbindicaco)
wbindicaco.dtypes, wbindicaco.shape

2.3.1.6 changing the data type

# Force str dtype so the .str.contains() filter below works on every row
# (missing values would otherwise be floats and break .str accessor use).
wbindicaco['metatype_1_value'] = wbindicaco['metatype_1_value'].astype(str)
wbindicaco

2.3.1.7 Get indicator Id

# Pick the indicator whose description mentions "CO2 emissions (kt)".
# .iloc[0] yields the scalar id; the original .to_string(index=False)
# could prepend whitespace or join several matches, corrupting the
# request URL built from this id later.
coindicaid = wbindicaco.loc[
    wbindicaco['metatype_1_value'].str.contains(r'CO2 emissions \(kt\)', case=False),
    'id'
].iloc[0]
coindicaid

2.3.2 CO2 emissions (kt) data

2.3.2.1 Url Connection setup

# Data endpoint: CO2 emissions for every country, all years, one JSON page.
wbcoemi_code_url = (
    "http://api.worldbank.org/v2"
    "/country/all"
    f"/indicator/{coindicaid}"
    "?format=json&per_page=20000"
)

2.3.2.2 using Get request to ingest from url:

# Make the HTTP request; the timeout avoids hanging on an unreachable API.
response = requests.get(wbcoemi_code_url, timeout=30)

# Check the status of the request before trusting the body.
if response.status_code == 200:
    raw_data = response.json()
    print("Request was successful.", response.headers['Content-Type'])
else:
    print(f"Request failed with status code: {response.status_code}")
# REPL inspection of the raw reply.
response.headers
response.content

2.3.2.3 Exploring json response

# Payload is again [pagination metadata, list of observations].
type(raw_data), len(raw_data)
raw_data[0]
# Deep copy so later edits cannot mutate raw_data.
json_wbcoemi = copy.deepcopy(raw_data[1])
type(json_wbcoemi)
json_wbcoemi[0]

2.3.2.4 Destructuring json response

# Flatten each observation: dict-valued fields (indicator, country)
# expand to "<field>_<key>" entries; scalar fields are copied through
# unchanged and in their original order.
destrc_json_wbcoemi = []
for record in json_wbcoemi:
    flat = {}
    for field, value in record.items():
        if isinstance(value, dict):
            flat.update({f'{field}_{inner_k}': inner_v for inner_k, inner_v in value.items()})
        else:
            flat[field] = value
    destrc_json_wbcoemi.append(flat)
destrc_json_wbcoemi

2.3.2.5 Use pandas DataFrame to convert the JSON data:

# Tabulate the flattened CO2 observations.
wbcoemi = pd.DataFrame(data=destrc_json_wbcoemi)
wbcoemi.dtypes, wbcoemi.shape
wbcoemi

2.3.2.6 Transformations

2.3.2.6.1 Renaming column
# Rename date->year, then prefix every column with "co2_" so the later
# three-way merge has unambiguous column names.
wbcoemi_trnsfmtn = wbcoemi.rename(columns={'date': 'year'}).add_prefix('co2_') #, 'value': 'co2_value'
2.3.2.6.2 Sorting
# Order rows by country then year and rebuild the index.
wbcoemi_trnsfmtn.sort_values(['co2_country_value', 'co2_year'], ascending=[True, True], inplace=True)
wbcoemi_trnsfmtn.reset_index(drop=True, inplace=True)

2.4 World Bank GDP data

2.4.1 Finding indicator Id

2.4.1.1 Url Connection setup

# Search the WDI source's indicator metadata for GDP-related indicators.
schkeywrd = 'GDP'

wbindicagdp_code_url = (
    "https://api.worldbank.org/v2"
    f"/sources/{wdi_srcid}"
    f"/search/{schkeywrd}"
    "?format=json&per_page=20000"
)

2.4.1.2 using Get request to ingest from url:

Run the below cell only once

# Make the HTTP request; the timeout avoids hanging on an unreachable API.
response = requests.get(wbindicagdp_code_url, timeout=30)

# Check the status of the request before trusting the body.
if response.status_code == 200:
    raw_data = response.json()
    print("Request was successful.", response.headers['Content-Type'])
else:
    print(f"Request failed with status code: {response.status_code}")
# REPL inspection of the raw reply.
response.headers
response.content

2.4.1.3 Exploring json response

# /search returns a dict keyed by 'source'; the indicator records live
# under source[0].concept[0].variable.
type(raw_data), len(raw_data)
raw_data
raw_data.keys()
type(raw_data['source']), len(raw_data['source'])

raw_data['source'][0]['concept'][0]['variable']
# Deep copy so later edits cannot mutate raw_data.
json_wbindicagdp = copy.deepcopy(raw_data['source'][0]['concept'][0]['variable'])
type(json_wbindicagdp)
json_wbindicagdp[0]

2.4.1.4 Destructuring json response

# Flatten each indicator record: list-valued fields expand to
# "<field>_<position>_<key>" entries; scalar fields pass through in
# their original order (guard-clause style).
destrc_json_wbindicagdp = []

for record in json_wbindicagdp:
    flat = {}
    for field, value in record.items():
        if not isinstance(value, list):
            flat[field] = value
            continue
        for pos, item in enumerate(value):
            for inner_key, inner_val in item.items():
                flat[f"{field}_{pos}_{inner_key}"] = inner_val
    destrc_json_wbindicagdp.append(flat)

2.4.1.5 Use pandas DataFrame to convert the JSON data:

# Tabulate the flattened GDP indicator metadata.
wbindicagdp = pd.DataFrame(data=destrc_json_wbindicagdp)
wbindicagdp.dtypes, wbindicagdp.shape

2.4.1.6 changing the data type

# Force str dtype so the .str.contains() filter below works on every row.
wbindicagdp['metatype_0_value'] = wbindicagdp['metatype_0_value'].astype(str)
wbindicagdp

2.4.1.7 Get indicator Id

# Pick the indicator described as "GDP (current US$)".
# .iloc[0] yields the scalar id; the original .to_string(index=False)
# could prepend whitespace or join several matches, corrupting the
# request URL built from this id later.
gdpindicaid = wbindicagdp.loc[
    wbindicagdp['metatype_0_value'].str.contains(r'GDP \(current US\$\)', case=False),
    'id'
].iloc[0]
gdpindicaid

2.4.2 GDP (current US$) data

2.4.2.1 Url Connection setup

# Data endpoint: GDP for every country, all years, one JSON page.
wbgdp_code_url = (
    "http://api.worldbank.org/v2"
    "/country/all"
    f"/indicator/{gdpindicaid}"
    "?format=json&per_page=20000"
)

2.4.2.2 using Get request to ingest from url:

Run the below cell only once

# Make the HTTP request; the timeout avoids hanging on an unreachable API.
response = requests.get(wbgdp_code_url, timeout=30)

# Check the status of the request before trusting the body.
if response.status_code == 200:
    raw_data = response.json()
    print("Request was successful.", response.headers['Content-Type'])
else:
    print(f"Request failed with status code: {response.status_code}")
# REPL inspection of the raw reply.
response.headers
response.content

2.4.2.3 Exploring json response

# Payload is [pagination metadata, list of observations].
type(raw_data), len(raw_data)
raw_data[0]
# Deep copy so later edits cannot mutate raw_data.
json_wbgdp = copy.deepcopy(raw_data[1])
type(json_wbgdp)
json_wbgdp[0]

2.4.2.4 Destructuring json response

# Flatten each observation: dict-valued fields (indicator, country)
# expand to "<field>_<key>" entries; scalar fields are copied through
# unchanged and in their original order (guard-clause style).
destrc_json_wbgdp = []
for record in json_wbgdp:
    flat = {}
    for field, value in record.items():
        if isinstance(value, dict):
            flat.update({f'{field}_{inner_k}': inner_v for inner_k, inner_v in value.items()})
            continue
        flat[field] = value
    destrc_json_wbgdp.append(flat)
destrc_json_wbgdp

2.4.2.5 Use pandas DataFrame to convert the JSON data:

# Tabulate the flattened GDP observations.
wbgdp = pd.DataFrame(data=destrc_json_wbgdp)
wbgdp.dtypes, wbgdp.shape
wbgdp

2.4.2.6 Transformations

2.4.2.6.1 Renaming column
# Rename date->year, then prefix every column with "gdp_" for the merge.
wbgdp_trnsfmtn = wbgdp.rename(columns={'date': 'year'}).add_prefix('gdp_')#, 'value': 'gdp_value'
2.4.2.6.2 Sorting
# Order rows by country then year and rebuild the index.
wbgdp_trnsfmtn.sort_values(['gdp_country_value', 'gdp_year'], ascending=[True, True], inplace=True)
wbgdp_trnsfmtn.reset_index(drop=True, inplace=True)

2.5 World Bank Life expectancy data

2.5.1 Finding indicator Id

2.5.1.1 Url Connection setup

# Search the WDI source's indicator metadata for life-expectancy indicators.
schkeywrd = 'Life expectancy at birth'

wbindicale_code_url = (
    "https://api.worldbank.org/v2"
    f"/sources/{wdi_srcid}"
    f"/search/{schkeywrd}"
    "?format=json&per_page=20000"
)

2.5.1.2 using Get request to ingest from url:

Run the below cell only once

# Make the HTTP request; the timeout avoids hanging on an unreachable API.
response = requests.get(wbindicale_code_url, timeout=30)

# Check the status of the request before trusting the body.
if response.status_code == 200:
    raw_data = response.json()
    print("Request was successful.", response.headers['Content-Type'])
else:
    print(f"Request failed with status code: {response.status_code}")
# REPL inspection of the raw reply.
response.headers
response.content

2.5.1.3 Exploring json response

# /search returns a dict keyed by 'source'; the indicator records live
# under source[0].concept[0].variable.
type(raw_data), len(raw_data)
raw_data
raw_data.keys()
type(raw_data['source']), len(raw_data['source'])

raw_data['source'][0]['concept'][0]['variable']
# Deep copy so later edits cannot mutate raw_data.
json_wbindicale = copy.deepcopy(raw_data['source'][0]['concept'][0]['variable'])
type(json_wbindicale)
json_wbindicale[0]

2.5.1.4 Destructuring json response

# Flatten each indicator record: list-valued fields expand to
# "<field>_<position>_<key>" entries; scalar fields pass through in
# their original order (guard-clause style).
destrc_json_wbindicale = []

for record in json_wbindicale:
    flat = {}
    for field, value in record.items():
        if not isinstance(value, list):
            flat[field] = value
            continue
        for pos, item in enumerate(value):
            for inner_key, inner_val in item.items():
                flat[f"{field}_{pos}_{inner_key}"] = inner_val
    destrc_json_wbindicale.append(flat)
destrc_json_wbindicale

2.5.1.5 Use pandas DataFrame to convert the JSON data:

# Tabulate the flattened life-expectancy indicator metadata.
wbindicale = pd.DataFrame(data=destrc_json_wbindicale)
wbindicale.dtypes, wbindicale.shape

2.5.1.6 changing the data type

# Force str dtype so the .str.contains() filter below works on every row.
wbindicale['metatype_1_value'] = wbindicale['metatype_1_value'].astype(str)
wbindicale

2.5.1.7 Get indicator Id

# Pick the indicator described as "Life expectancy at birth, total (years)".
# .iloc[0] yields the scalar id; the original .to_string(index=False)
# could prepend whitespace or join several matches, corrupting the
# request URL built from this id later.
leindicaid = wbindicale.loc[
    wbindicale['metatype_1_value'].str.contains(r'Life expectancy at birth, total \(years\)', case=False),
    'id'
].iloc[0]
leindicaid

2.5.2 Life expectancy at birth, total (years) data

2.5.2.1 Url Connection setup

# Data endpoint: life expectancy for every country, all years, one page.
wble_code_url = (
    "http://api.worldbank.org/v2"
    "/country/all"
    f"/indicator/{leindicaid}"
    "?format=json&per_page=20000"
)

2.5.2.2 using Get request to ingest from url:

# Make the HTTP request; the timeout avoids hanging on an unreachable API.
response = requests.get(wble_code_url, timeout=30)

# Check the status of the request before trusting the body.
if response.status_code == 200:
    raw_data = response.json()
    print("Request was successful.", response.headers['Content-Type'])
else:
    print(f"Request failed with status code: {response.status_code}")
# REPL inspection of the raw reply.
response.headers
response.content

2.5.2.3 Exploring json response

# Payload is [pagination metadata, list of observations].
type(raw_data), len(raw_data)
raw_data[0]
# Deep copy so later edits cannot mutate raw_data.
json_wble = copy.deepcopy(raw_data[1])
type(json_wble)
json_wble[0]

2.5.2.4 Destructuring json response

# Flatten each observation: dict-valued fields (indicator, country)
# expand to "<field>_<key>" entries; scalar fields are copied through
# unchanged and in their original order.
destrc_json_wble = []
for record in json_wble:
    flat = {}
    for field, value in record.items():
        if isinstance(value, dict):
            flat.update({f'{field}_{inner_k}': inner_v for inner_k, inner_v in value.items()})
        else:
            flat[field] = value
    destrc_json_wble.append(flat)
destrc_json_wble

2.5.2.5 Use pandas DataFrame to convert the JSON data:

# Tabulate the flattened life-expectancy observations.
wble = pd.DataFrame(data=destrc_json_wble)
wble.dtypes, wble.shape
wble

2.5.2.6 Transformations

2.5.2.6.1 Renaming column
# Rename date->year, then prefix every column with "life_exp_" for the merge.
wble_trnsfmtn = wble.rename(columns={'date': 'year'}).add_prefix('life_exp_')#, 'value': 'life_exp_value'
2.5.2.6.2 Sorting
# Order rows by country then year and rebuild the index.
wble_trnsfmtn.sort_values(['life_exp_country_value', 'life_exp_year'], ascending=[True, True], inplace=True)
wble_trnsfmtn.reset_index(drop=True, inplace=True)


3. Merge all Dataframes

3.1 Countries dataframe

# Keep only the merge key plus the descriptive columns.
# NOTE(review): after the earlier label swap, 'iso2Code' holds the
# 3-letter code that matches the indicators' countryiso3code — confirm.
wbctry_filt = wbctry_filt[['iso2Code', 'name', 'region_value', 'incomeLevel_value']]

wbctry_filt.dtypes

3.2 CO2 emissions dataframe

# Keep only country code, year, and the CO2 value.
wbcoemi_trnsfmtn = wbcoemi_trnsfmtn[['co2_countryiso3code', 'co2_year', 'co2_value']]

wbcoemi_trnsfmtn.dtypes

3.3 GDP dataframe

# Keep only country code, year, and the GDP value.
wbgdp_trnsfmtn = wbgdp_trnsfmtn[['gdp_countryiso3code', 'gdp_year', 'gdp_value']]

wbgdp_trnsfmtn.dtypes

3.4 Life expectancy dataframe

# Keep only country code, year, and the life-expectancy value.
wble_trnsfmtn = wble_trnsfmtn[['life_exp_countryiso3code', 'life_exp_year', 'life_exp_value']]

wble_trnsfmtn.dtypes

3.5 Merging

# Three right joins: every indicator observation is kept even when the
# country metadata (or an earlier indicator) has no matching row, so
# co2/gdp columns may contain NaN after the later joins.
wb_df = wbctry_filt.merge(wbcoemi_trnsfmtn, left_on=["iso2Code"], right_on=["co2_countryiso3code"], how="right")[['iso2Code','name', 'region_value', 'incomeLevel_value', 'co2_year', 'co2_value']]
wb_df = wb_df.merge(wbgdp_trnsfmtn, left_on=["iso2Code","co2_year"], right_on=["gdp_countryiso3code", "gdp_year"], how="right")[['iso2Code','name', 'region_value', 'incomeLevel_value', 'co2_year', 'co2_value', 'gdp_value']]
wb_df = wb_df.merge(wble_trnsfmtn, left_on=["iso2Code","co2_year"], right_on=["life_exp_countryiso3code", "life_exp_year"], how="right")[['iso2Code','name', 'region_value', 'incomeLevel_value', 'co2_year', 'co2_value', 'gdp_value', 'life_exp_value']]

3.6 Renaming column names

# Final tidy names for the merged panel.
wb_df.rename(columns={'iso2Code': 'country_code','name': 'country_name', 'region_value': 'region', 'incomeLevel_value': 'income_group', 'co2_year': 'year', 'co2_value': 'co2', 'gdp_value': 'gdp', 'life_exp_value': 'life_exp'}, inplace=True)

3.7 Rearranging columns

# Presentation order; .copy() detaches from the merge intermediates.
wb_df = wb_df[[
    'country_name',
    'country_code',
    'year',
    'co2',
    'gdp',
    'life_exp',
    'income_group',
    'region',
]].copy()
wb_df

4. Store final dataframe in SQL Server

4.1 Configuring Database connection setup

4.1.1 Check the driver

# List locally installed ODBC drivers to confirm the driver name below.
pyodbc.drivers()

4.1.2. Configure the connection string

# Build the SQLAlchemy URL for SQL Server via pyodbc.
# NOTE(review): sql_login_name, sql_login_password, server_name,
# port_number and database_name must be defined beforehand — they are
# not set anywhere in this article (kept out of the text as secrets).
connection_url = URL.create(
    "mssql+pyodbc",
    username = sql_login_name,
    password = sql_login_password,
    host = server_name,
    port= port_number,
    database = database_name,
    query = {
        "driver": "ODBC Driver 18 for SQL Server",
         "TrustServerCertificate": "yes", # When yes, the transport layer will use SSL to encrypt the channel and bypass walking the certificate chain to validate trust. Useful when using self-signed certificates or when the certificate chain cannot be validated.
        "authentication": "SqlPassword", # use SQL login credentials instead of Windows authentication.
        "pool_size": "1", # to limit the number of sessions to one
    },
)

4.1.3. Create an engine using the create_engine() function, specifying the database URL

# Lazily-connecting engine; no connection is opened until first use.
engine = create_engine(connection_url)

4.1.4 Create a session using sessionmaker

  • only run this if you are not using pandas read sql query or to sql i.e if you want to perform DDL or DML oprations:
# Session factory + session for explicit DDL/DML (DROP TABLE below).
Session = sessionmaker(bind=engine)
session = Session()

4.2 Read the existing tables in the SQL Server Database

4.2.1 Using Pandas read_sql_query() method - DQL: Select

  • first, confirm if the tables already exist in the database
# List all base tables in the database (INFORMATION_SCHEMA metadata view).
qlist_tables = """
    SELECT TOP 10000 *
    FROM [dballpurpose].INFORMATION_SCHEMA.TABLES
    WHERE TABLE_TYPE IN ('BASE TABLE')
    ORDER BY TABLE_NAME ASC
"""

df_var = pd.read_sql_query(qlist_tables,engine)
df_var
# Target table for this article's output.
table_name = 'World bank tidy'

# f-string interpolation into SQL is acceptable here only because
# table_name is a hard-coded literal, never user input.
qlist_tables = f"""
    SELECT TOP 10000 *
    FROM [dballpurpose].INFORMATION_SCHEMA.TABLES
    WHERE TABLE_TYPE IN ('BASE TABLE')
        AND TABLE_NAME = '{table_name}'
    ORDER BY TABLE_NAME ASC
"""

df_var = pd.read_sql_query(qlist_tables,engine)

# Empty result => the table does not exist yet.
if df_var.empty:
    print(f"Table [{table_name}] does not exist")
else:
    print(f"Table [{table_name}] exists")

4.2.2 Using sqlalchemy Session() method - DDL: DROP

  • Drop table if n only if exists

Caution: Below is a DROP SQL statement

# Define the DROP SQL statement

table_name = 'World bank tidy'
# OBJECT_ID(..., N'U') returns non-NULL only for an existing user table,
# so the DROP runs only when the table is present (idempotent drop).
qdrp_table = f"""
IF OBJECT_ID(N'[dbo].[{table_name}]', N'U') IS NOT NULL
BEGIN
    DROP TABLE [dballpurpose].[dbo].[{table_name}]
END
"""

try:
    # Execute the SQL statement using the session's execute() method
    session.execute(text(qdrp_table))

    # Commit the changes
    session.commit()
    print(f"{table_name} dropped successfully!")
except OperationalError as e:
    # Handle the OperationalError exception; roll back the failed transaction
    session.rollback()
    print(f"An error occurred: {str(e)}")
finally:
    # Close the session
    session.close()

4.3 Send the ingested data in dataframes to SQL Server tables

4.3.1 Using Pandas to_sql() method - DDL: Create

Run the below cell only once

table_name = 'World bank tidy'

# if_exists='replace' drops and recreates the table on re-runs;
# index=False keeps the pandas RangeIndex out of the table.
wb_df.to_sql(table_name, engine, if_exists='replace', index=False)

5. Query the data from SQL table

  • Read from your database Using Pandas read_sql_query() method - DQL: Select
# Read the freshly written table back into a DataFrame as a round-trip check.
sql_string = """
  SELECT 
    *
  FROM [dballpurpose].[dbo].[World bank tidy]
"""

df_var = pd.read_sql(sql_string, engine)
df_var

  • Viz 📉
# Aggregate to one row per (year, income_group): total CO2 and mean life
# expectancy.  reset_index() turns the group keys back into columns —
# after groupby() they live in the index, and seaborn resolves x= / hue=
# by column name, so without it the barplot lookups fail.
df_var = df_var.dropna().groupby(['year', 'income_group']).agg({'co2': 'sum', 'life_exp': 'mean'}).reset_index()

fig, axs = plt.subplots(1, 1, figsize=(25, 8))

sns.set_theme(style="darkgrid")

sns.barplot(
    data=df_var,
    x='year',
    y='co2',
    hue='income_group',
    palette='Pastel1',
    ax=axs
)

# Re-apply the current x tick labels rotated 90 degrees so the many year
# labels do not overlap.
tick_positions = axs.get_xticks()
tick_labels = [label.get_text() for label in axs.get_xticklabels()]
axs.set_xticks(tick_positions)
axs.set_xticklabels(labels=tick_labels, rotation=90)

# Relabel the y axis in millions (displayed with the article's "PPM" unit).
tick_positions = axs.get_yticks()
tick_labels = [f'{int(tick/1e6)} PPM' for tick in tick_positions]  # Custom labels in parts per millions
axs.set_yticks(tick_positions)
axs.set_yticklabels(labels=tick_labels)

plt.title(label='Worlds CO2 emission by year', loc='center')

plt.ylabel('CO2 Emissions (PPM)')

plt.show()

6. Save wb_df as CSV file

6.1 Check file if exists

# Show the current working directory before changing it.
os.getcwd()
# Define the file path
# NOTE(review): 'nnnn' is a placeholder — replace with a real directory.
path = r'nnnn'
file_name = 'wb_tidy'

# os.chdir changes the working directory for the whole process, which
# affects every later relative path in this session.
os.chdir(f'{path}')

csv_files = [file for file in os.listdir() if file.endswith('.csv')]

# Substring match: any CSV whose name contains 'wb_tidy' counts as found.
if any(file_name in file for file in csv_files):
    print(f'Found a file with {file_name} in the name.')
else:
    print(f'No {file_name} in any file name.')

6.2 Remove file if exists

# Define the file path
# NOTE(review): path is concatenated directly with the file name, so it
# must end with a path separator (as in the example path of section 7.1).
path = r'nnnn'
file_name = 'wb_tidy'
file_path = f"{path}{file_name}.csv"

try:
    # Check if the file exists
    if os.path.exists(file_path):
        # Delete the file
        os.remove(file_path)
        print(f"File {file_name} deleted successfully!")
    else:
        print(f"File {file_name} does not exist.")
except Exception as e:
    # Handle any exceptions that occur during file deletion
    print("An error occurred:", str(e))

6.3 Using pandas to_csv() to save dataframe as CSV file

Run the below cell only once

# Define the file path
# NOTE(review): path must end with a separator — it is concatenated
# directly with the file name below.
path = r'nnnn'
file_name = 'wb_tidy'
file_path = f"{path}{file_name}.csv"

try:
    # index=False keeps the RangeIndex out of the CSV.
    wb_df.to_csv(f'{file_path}', index=False)
    print(f'CSV file {file_name}.csv is successfully saved')
except Exception as e:
    print(f"An error occurred while saving the CSV file: {e}")

7. Read from CSV file

7.1 Using pandas

# Define the file path (note the trailing separator in the raw string).
path = r'C:\Users\HP\Downloads\\'
file_name = 'wb_tidy'
file_path = f"{path}{file_name}.csv"

try:
    # Round-trip check: read the file written in section 6.3.
    df_var = pd.read_csv(filepath_or_buffer=f'{file_path}', delimiter=',')
    print(f'CSV file {file_name} is successfully read')
except Exception as e:
    print(f"An error occurred while reading the CSV file: {e}")
df_var

Conclusion

Learning Objectives,

  • Python & Pandas: Import libraries and use Pandas for data manipulation and analysis.

  • Database Connectivity: Configuring database connections and creating engines with SQLAlchemy in Python.

  • Data Ingestion: Ingesting data from The World Bank Open Data using Python requests and Pandas functions.

  • SQL Operations: Perform CRUD operations and query data from SQL Server using Python.

  • Data Visualization: Visualize data using Python libraries such as Matplotlib and Seaborn for insightful charts and graphs.

Source: Self [Link], [Link]

Author: Dheeraj. Yss

Connect with me:

Did you find this article valuable?

Support dheerajy blog by becoming a sponsor. Any amount is appreciated!