Table of contents
- NASA
- 1. Import libraries
- 2. Ingest data from NASA of (Meteorite Landings)
- 3. Store final dataframe in SQL Server
- 4. Query the data from SQL table
- 5. Save nasa_ml_filt as CSV file
- 6. Read from CSV file
- Conclusion
- Source: Self [Link], [Link]
- Author: Dheeraj. Yss
- Connect with me:
This article will demonstrate the data ingestion and exploration for Meteorite Landings dataset provided by NASA Open Data Portal.
NASA
Open Data Portal
1. Import libraries
import copy
import os

import matplotlib.pyplot as plt
import pandas as pd
import pyodbc
import requests
import seaborn as sns
from sqlalchemy import create_engine, text
from sqlalchemy.engine import URL
from sqlalchemy.exc import OperationalError, SQLAlchemyError
from sqlalchemy.orm import sessionmaker
# Pandas display configuration: None lifts both limits, so every column is
# shown and cell contents are never truncated.
pd.options.display.max_colwidth = None
pd.options.display.max_columns = None
2. Ingest data from NASA of (Meteorite Landings)
- This comprehensive data set from The Meteoritical Society contains information on all of the known meteorite landings.
2.1 Meteorite Landings data
2.1.1 Url Connection setup
# get data from open data endpoints
# Socrata endpoint for the Meteorite Landings dataset; the $limit query
# parameter caps how many rows the API returns in one response.
limit = 100000
_endpoint = "https://data.nasa.gov/resource/gh4g-9sfh.json"
nasa_meteorland_url = f"{_endpoint}?$limit={limit}"
2.1.2 Using a GET request to ingest from the URL:
# Make the HTTP request; `raw_data` is only bound when the call succeeds.
response = requests.get(nasa_meteorland_url)
# Guard clause: report failures first, otherwise parse the JSON payload.
if response.status_code != 200:
    print(f"Request failed with status code: {response.status_code}")
else:
    raw_data = response.json()
    print("Request was successful.", response.headers['Content-Type'])
- headers
# Inspect the HTTP response headers returned by the API.
response.headers
- contents
# Raw response payload (undecoded body).
response.content
2.1.3 Exploring json response
# Quick shape check on the parsed payload: container type and record count.
type(raw_data), len(raw_data)
# Peek at the first record.
raw_data[0]
# Work on a deep copy so the original response data stays untouched.
json_nasa_ml = copy.deepcopy(raw_data)
type(json_nasa_ml), len(json_nasa_ml)
json_nasa_ml[0]
2.1.4 Destructuring json response
def _flatten_record(record):
    """Return a flat copy of *record*: dict values are collapsed to a tuple
    of their values, every other value is kept as-is.

    The API payload nests some fields (e.g. the geolocation) as dicts;
    flattening them makes the records DataFrame-friendly.
    """
    return {
        key: tuple(value.values()) if isinstance(value, dict) else value
        for key, value in record.items()
    }

# Destructure every meteorite record into a flat dict.
destrc_json_nasa_ml = [_flatten_record(meteor) for meteor in json_nasa_ml]
destrc_json_nasa_ml
2.1.5 Use pandas DataFrame to convert the JSON data:
# Build a DataFrame from the flattened records; columns come from the JSON keys.
nasa_ml = pd.DataFrame(data=destrc_json_nasa_ml)
# Inspect dtypes and shape — values likely arrive as strings/object dtype
# from the JSON payload; TODO confirm before numeric operations downstream.
nasa_ml.dtypes, nasa_ml.shape
nasa_ml
2.1.6 Transformations
2.1.6.1 Renaming column
# Rename columns to friendlier display names ('mass (g)' records the unit).
nasa_ml_filt = nasa_ml.rename(columns={'mass': 'mass (g)', 'geolocation': 'GeoLocation'})
2.1.6.2 Check isna() (alias isnull()), 'Aggregates' values, and empty strings
- Viz 📊
# Count, per column: NaN rows, literal 'Aggregates' rows, and empty-string
# rows. Built from a small helper instead of one very long chained
# expression; the three count frames are merged on 'Column names'.
def _per_column_count(mask_frame, count_name):
    """Sum a boolean mask frame per column and return a two-column frame:
    'Column names' and *count_name*."""
    return (
        mask_frame.sum()
        .to_frame()
        .reset_index(names='Column names')
        .rename(columns={0: count_name})
    )

df_var = (
    _per_column_count(nasa_ml_filt.isna(), 'isna row count')
    .merge(_per_column_count(nasa_ml_filt.eq('Aggregates'), 'Aggregates row count'))
    .merge(_per_column_count(nasa_ml_filt.eq(''), 'empty string row count'))
)
# Overlay three per-column data-quality counts in one figure:
#   axs  — bars: isna() row count (left y-axis)
#   axs2 — bars: 'Aggregates' row count (first right y-axis)
#   axs3 — line: empty-string row count (second right y-axis)
fig, axs = plt.subplots(1, 1, figsize=(25, 8))
# NOTE: set_theme() mutates matplotlib's global style for the session.
sns.set_theme(style="darkgrid")
sns.barplot(
    data=df_var,
    x='Column names',
    y='isna row count',
    hue='isna row count',
    palette='Pastel1',
    ax=axs
)
# Move the hue legend for axs near the upper-left of the plot area.
axs.legend(title='isna row count', bbox_to_anchor=(0.10, 0.93), loc='center right')
# Create the second y-axis for Aggregates row count.
axs2 = axs.twinx()
sns.barplot(
    data=df_var,
    x='Column names',
    y='Aggregates row count',
    hue='Aggregates row count',
    palette='coolwarm',
    ax=axs2,
)
# Push axs2's right spine 60 pt outward so it doesn't overlap the first axis.
axs2.spines['right'].set_position(('outward', 60))
# Create the third y-axis (also twinned off axs) for empty string row count.
axs3 = axs.twinx()
sns.lineplot(
    data=df_var,
    x='Column names',
    y='empty string row count',
    ax=axs3,
)
# Push axs3's spine further out (120 pt), past axs2's.
axs3.spines['right'].set_position(('outward', 120))
# Get the current tick positions and labels.
tick_positions = axs.get_xticks()
tick_labels = [label.get_text() for label in axs.get_xticklabels()]
# Re-apply the ticks with labels rotated 90° so long column names stay legible.
axs.set_xticks(tick_positions)
axs.set_xticklabels(labels=tick_labels, rotation=90)
plt.title(label='isna(), Aggregates and empty string row count', loc='center')
# Disable all three grids so the overlaid plots stay readable.
axs.grid(False)
axs2.grid(False)
axs3.grid(False)
plt.show()
# Display the dataframe again (unmodified by the plotting above).
nasa_ml_filt
2.1.6.3 Extract year using str.extract
# Keep only the 4-digit year from the raw 'year' strings. The raw values are
# full timestamps, and some (e.g. 1583-01-01) fall outside pandas' nanosecond
# datetime range, raising OutOfBoundsDatetime if parsed with to_datetime —
# see the quoted error below.
nasa_ml_filt['year'] = nasa_ml_filt['year'].str.extract(r'(\d{4})')
nasa_ml_filt
Error (raised when parsing the raw 'year' strings with pd.to_datetime):
OutOfBoundsDatetime: Out of bounds nanosecond timestamp: 1583-01-01T00:00:00.000, at position 174
OutOfBoundsDatetime: Out of bounds nanosecond timestamp: 1583-01-01T00:00:00.000, at position 125. You might want to try:
- passing `format` if your strings have a consistent format;
- passing `format='ISO8601'` if your strings are all ISO8601 but not necessarily in exactly the same format;
- passing `format='mixed'`, and the format will be inferred for each element individually. You might want to use `dayfirst` alongside this.
3. Store final dataframe in SQL Server
3.1 Configuring Database connection setup
3.1.1 Check the driver
# List the ODBC drivers installed on this machine — pick one for the URL below.
pyodbc.drivers()
3.1.2. Configure the connection string
# Build the SQLAlchemy connection URL for SQL Server via pyodbc.
# NOTE: connection pooling is an Engine setting, not an ODBC attribute — the
# original "pool_size" entry in `query` was passed straight into the ODBC
# connection string, where it has no meaning. Pass pool_size=1 to
# create_engine() instead (see the engine-creation step below).
connection_url = URL.create(
    "mssql+pyodbc",
    username=sql_login_name,
    password=sql_login_password,
    host=server_name,
    port=port_number,
    database=database_name,
    query={
        "driver": "ODBC Driver 18 for SQL Server",
        # Encrypt the channel but bypass certificate-chain validation —
        # useful with self-signed certificates.
        "TrustServerCertificate": "yes",
        # Use SQL login credentials instead of Windows authentication.
        "authentication": "SqlPassword",
    },
)
3.1.3. Create an engine using the create_engine() function, specifying the database URL
# Create the engine; pool_size=1 limits the connection pool to a single
# session (this is the Engine-level home for the pooling setting).
engine = create_engine(connection_url, pool_size=1)
3.1.4 Create a session using sessionmaker
- Only run this if you are not using pandas read_sql_query() or to_sql(), i.e. if you want to perform DDL or DML operations:
# Session factory bound to the engine; the session is used below for DDL
# (pandas read_sql/to_sql manage their own connections and don't need it).
Session = sessionmaker(bind=engine)
session = Session()
3.2 Read the existing tables in the SQL Server Database
3.2.1 Using Pandas read_sql_query() method - DQL: Select
- first, confirm if the tables already exist in the database
# DQL: list the base tables in the dballpurpose database as a sanity check.
qlist_tables = """
SELECT TOP 10000 *
FROM [dballpurpose].INFORMATION_SCHEMA.TABLES
WHERE TABLE_TYPE IN ('BASE TABLE')
ORDER BY TABLE_NAME ASC
"""
df_var = pd.read_sql_query(qlist_tables,engine)
df_var
# Check whether the target table already exists, filtering by TABLE_NAME.
table_name = 'Meteorite Landings'
# NOTE(review): table_name is interpolated into the SQL via f-string; that is
# acceptable for this local constant, but use bound parameters for any
# externally supplied value.
qlist_tables = f"""
SELECT TOP 10000 *
FROM [dballpurpose].INFORMATION_SCHEMA.TABLES
WHERE TABLE_TYPE IN ('BASE TABLE')
AND TABLE_NAME = '{table_name}'
ORDER BY TABLE_NAME ASC
"""
df_var = pd.read_sql_query(qlist_tables,engine)
if df_var.empty:
    print(f"Table [{table_name}] does not exist")
else:
    print(f"Table [{table_name}] exists")
3.2.2 Using sqlalchemy Session() method - DDL: DROP
- Drop the table if and only if it exists
Caution: Below is a DROP SQL statement
# Define the DROP SQL statement.
table_name = 'Meteorite Landings'
# OBJECT_ID(..., N'U') resolves only existing user tables, so the DROP body
# runs only if the table is actually present.
qdrp_table = f"""
IF OBJECT_ID(N'[dbo].[{table_name}]', N'U') IS NOT NULL
BEGIN
DROP TABLE [dballpurpose].[dbo].[{table_name}]
END
"""
try:
    # Execute the DROP inside the session and commit the transaction.
    session.execute(text(qdrp_table))
    session.commit()
    print(f"{table_name} dropped successfully!")
except SQLAlchemyError as e:
    # Catch the full SQLAlchemy error hierarchy — the original caught only
    # OperationalError, so e.g. a ProgrammingError escaped without rollback.
    session.rollback()
    print(f"An error occurred: {str(e)}")
finally:
    # Always release the session.
    session.close()
3.3 Send the ingested data in dataframes to SQL Server tables
3.3.1 Using Pandas to_sql() method - DDL: Create
Run the below cell only once
# DDL via pandas: create (or replace) the SQL table from the dataframe.
# if_exists='replace' drops and recreates the table on re-runs; index=False
# keeps the dataframe's RangeIndex out of the table.
table_name = 'Meteorite Landings'
nasa_ml_filt.to_sql(table_name, engine, if_exists='replace', index=False)
4. Query the data from SQL table
- Read from your database Using Pandas read_sql_query() method - DQL: Select
# DQL: read back a 20-row sample of the stored table.
table_name = 'Meteorite Landings'
sql_string = f"""
SELECT TOP 20
*
FROM [dballpurpose].[dbo].[{table_name}]
"""
df_var = pd.read_sql(sql_string, engine)
df_var
- Viz 📊
# Mean mass per (name, fall) pair, then a grouped bar plot. Two fixes over
# the original chain:
#  - 'mass (g)' comes back from SQL as text, so coerce it to numeric before
#    taking 'mean' (bad values become NaN and are dropped) — TODO confirm the
#    stored column type against the database.
#  - reset_index() turns the groupby keys back into columns; seaborn looks up
#    x='name' and hue='fall' as columns, not as MultiIndex levels.
df_var = (
    df_var.assign(**{'mass (g)': pd.to_numeric(df_var['mass (g)'], errors='coerce')})
    .dropna()
    .groupby(['name', 'fall'])
    .agg({'mass (g)': 'mean'})
    .reset_index()
)
fig, axs = plt.subplots(1, 1, figsize=(25, 8))
sns.set_theme(style="darkgrid")
sns.barplot(
    data=df_var,
    x='name',
    y='mass (g)',
    hue='fall',
    palette='Pastel1',
    ax=axs
)
# Re-apply the x ticks with labels rotated 90° so long names stay legible.
tick_positions = axs.get_xticks()
tick_labels = [label.get_text() for label in axs.get_xticklabels()]
axs.set_xticks(tick_positions)
axs.set_xticklabels(labels=tick_labels, rotation=90)
plt.title(label='Mean Mass of Meteorites by Fall Type', loc='center')
plt.show()
5. Save nasa_ml_filt as CSV file
5.1 Check file if exists
# Show the current working directory before changing it.
os.getcwd()
# Define the file path.
# NOTE(review): 'cc' looks like a redacted/placeholder directory — confirm
# the real path before running.
path = r'cc'
file_name = 'Meteorite_Landings'
# Changes the process-wide cwd; every later relative path resolves under it.
os.chdir(f'{path}')
csv_files = [file for file in os.listdir() if file.endswith('.csv')]
# Substring match: flags any CSV whose name contains file_name.
if any(file_name in file for file in csv_files):
    print(f'Found a file with {file_name} in the name.')
else:
    print(f'No {file_name} in any file name.')
5.2 Remove file if exists
# Define the file path.
path = r'cc'
file_name = 'Meteorite_Landings'
# Join with the platform separator — the original f"{path}{file_name}.csv"
# concatenated directly and produced e.g. 'ccMeteorite_Landings.csv'.
file_path = os.path.join(path, f"{file_name}.csv")
try:
    # Remove a stale copy so the subsequent save starts clean.
    if os.path.exists(file_path):
        os.remove(file_path)
        print(f"File {file_name} deleted successfully!")
    else:
        print(f"File {file_name} does not exist.")
except Exception as e:
    # Handle any exceptions that occur during file deletion.
    print("An error occurred:", str(e))
5.3 Using pandas to_csv() to save dataframe as CSV file
# Define the file path.
path = r'cc'
file_name = 'Meteorite_Landings'
# os.path.join supplies the separator the original f-string concatenation
# dropped (which yielded 'ccMeteorite_Landings.csv').
file_path = os.path.join(path, f"{file_name}.csv")
try:
    # Persist the cleaned dataframe; index=False keeps the RangeIndex out
    # of the CSV.
    nasa_ml_filt.to_csv(file_path, index=False)
    print(f'CSV file {file_name}.csv is successfully saved')
except Exception as e:
    print(f"An error occurred while saving the CSV file: {e}")
6. Read from CSV file
6.1 Using pandas
# Define the file path.
path = r'cc'
file_name = 'Meteorite_Landings'
# os.path.join avoids the missing-separator bug of plain f-string concatenation.
file_path = os.path.join(path, f"{file_name}.csv")
try:
    # Read the CSV back; delimiter=',' is the default but kept explicit.
    df_var = pd.read_csv(filepath_or_buffer=file_path, delimiter=',')
    print(f'CSV file {file_name} is successfully read')
except Exception as e:
    print(f"An error occurred while reading the CSV file: {e}")
# NOTE(review): if the read fails, df_var here is whatever an earlier cell
# left behind (or undefined) — the displayed frame may be stale.
df_var
Conclusion
Learning Objectives,
Python & Pandas: Import libraries and use Pandas for data manipulation and analysis.
Database Connectivity: Configuring database connections and creating engines with SQLAlchemy in Python.
Data Ingestion: Ingesting data from the NASA Open Data Portal using Python requests and Pandas functions.
SQL Operations: Perform CRUD operations and query data from SQL Server using Python.
Data Visualization: Visualize data using Python libraries such as Matplotlib and Seaborn for insightful charts and graphs.