spacex__sql_week2ipynb.py

# -*- coding: utf-8 -*-
"""SpaceX._SQL_week2ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1njUH4yaaSvTAQKCiMMKh4nEvJ2gZfrmd

# Connect to the database
Let us first load the SQL extension and establish a connection with the database
"""

import pandas as pd
df = pd.read_csv("https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DS0321EN-SkillsNetwork/labs/module_2/data/Spacex.csv")
#df.to_sql("SPACEXTBL", con, if_exists='replace', index=False,method="multi")

df.info()

df.head(3)

"""## Tasks

Now write and execute SQL queries to solve the assignment tasks.

**Note: If the column names are in mixed case enclose it in double quotes
For Example "Landing_Outcome"**

### Task 1

##### Display the names of the unique launch sites  in the space mission
"""

set(df['Launch_Site'])  #Utilizing the set function to extract unique values from a column containing launch site names. The set function eliminates duplicates and presents the remaining distinct values as the output.

"""### Task 2

##### Display 5 records where launch sites begin with the string 'CCA'
In data science, the rows of a table or dataset are called records, and the columns are called fields or columns. These terms are typically used to describe the structure of tabular data, such as databases, spreadsheets, and CSV files.
"""

df_CCA01= df[df['Launch_Site']=='CCAFS LC-40']
#df[df['Launch_Site'].str.startswith('CCAFS')]: This part filters the DataFrame df to only include rows where the value in the Launch_Site column starts with the string "CCAFS".

"""The query method allows you to filter the DataFrame based on a boolean expression. Here, Launch_Site.str.contains('CCA') checks for the presence of "CCA" in the Launch_Site column.
This method iterates through each row of the DataFrame using a list comprehension. It checks if "CCA" exists within the Launch_Site column and adds matching rows to a new list. You can optionally convert the list back to a DataFrame using its columns.

These methods achieve the same result as the original code, but they provide different approaches for filtering the DataFrame. Choose the method that best suits your coding style and readability preferences.
df_CCA= df[df['Launch_Site'].str.contains('CCA')]
"""

df_CCA.head()

df_CCA= df[df['Launch_Site'].str.contains('CCA')]

df_CCA

set(df_CCA['Launch_Site'])  #"The records identified by the filtering criteria all belong to the same launch site."

df_CCA.head()

"""### Task 3

##### Display the total payload mass carried by boosters launched by NASA (CRS)
Task 3 asks to show me the payload mass and booster versions.

"""

df.info()

df['Customer'] == 'NASA (CRS)'

#The query is asking for a way to display the PAYLOAD_MASS__KG_ and Booster_Version columns from the df DataFrame. The provided code snippet achieves this by:
#Creating a new DataFrame: A new DataFrame named df_Mass is created by selecting only the PAYLOAD_MASS__KG_ and Booster_Version columns from the original df DataFrame. This extracts the specific columns of interest.
#Displaying the new DataFrame: The print() statement displays the contents of the df_Mass DataFrame. This presents the extracted information in a tabular format.

df_Mass=df[['PAYLOAD_MASS__KG_', 'Booster_Version', 'Customer']]
df_Mass

NASA_CRS = df[df['Customer'] == 'NASA (CRS)']

Total_PayloadMass = NASA_CRS['PAYLOAD_MASS__KG_'].sum()
Total_PayloadMass

"""### Task 4

##### Display average payload mass carried by booster version F9 v1.1
"""

#Use the set function to find the distinct set of values in the Booster_Version column.
#This will print a set of strings, each representing a unique booster version.
print(set(df_Mass['Booster_Version']))
#Output Format Differences: set vs. print
#set: Returns a collection of values as a data structure, which can include lists, dictionaries, or other types. This data structure can be used for further processing within the program.
#print: Displays values as a simple string to the output (e.g., console or file). This string can include text, numbers, or a combination of both

#Filter the df_Mass dataframe for data pertaining to booster version F9 v1.1.
# Filter data for booster version F9 v1.1
df_F9= df_Mass[df_Mass['Booster_Version'].str.contains("F9 v1.1")]

# Calculate average payload mass
# Use the mean() function to calculate the average payload mass.
df_F9['PAYLOAD_MASS__KG_'].mean()
#This code displays the average payload mass carried by the F9 v1.1 booster version.
#This helps you understand how much payload this booster version can carry on average.

df_F9['PAYLOAD_MASS__KG_'].sum()

df['PAYLOAD_MASS__KG_'].sum()

df_F9  #Show all booster_version and payload_mass_kg that start with the phrase F9 V1.1.

"""### Task 5

##### List the date when the first succesful landing outcome in ground pad was acheived.

"""

#Task 5 says to tell the date of the first successful LANDING OUTCOME.
df_Date=df[['Date', 'Landing_Outcome']]

df_Date

df_date= df_Date[df_Date['Landing_Outcome'] ==  'Success (ground pad)']
#df_date= df_Date[df_Date['Landing_Outcome']=='Success']: This line filters the df_Date DataFrame to include only rows where the value in the 'Landing_Outcome' column is equal to 'Success'. The result is stored in a new DataFrame named df_date.

df_date

#The minimum date among successful landings in your data can be found using the min() method after filtering for successful outcomes
#Find minimum date: The min() method is applied to the 'Date' column of df_successful_landings to find the earliest date among successful landings
min_data = df_Date['Date'].min()
print(f"Date when the first successful landing outcome in ground pad was achieved: {min_data}")

df_success_land = df_Date[df_Date['Landing_Outcome']=='Success']  #"An alternative approach suggested by the project itself"

df_success_land["Date"] = df_success_land["Date"].astype('datetime64[ns]') #"An alternative approach suggested by the project itself"

df_date = df_success_land.sort_values(by='Date') #"An alternative approach suggested by the project itself"

df_date['Date'].min()  #"An alternative approach suggested by the project itself"

"""### Task 6

##### List the names of the boosters which have success in drone ship and have payload mass greater than 4000 but less than 6000
"""

set(df['Landing_Outcome'])

df_booster = df[['Landing_Outcome','Booster_Version', 'PAYLOAD_MASS__KG_']]  #Filter the data to only show successful drone ship landings
df_booster

#Select only rows with payload mass between 4000 and 6000 kilograms (df['PAYLOAD_MASS__KG_'] > 4000 and df['PAYLOAD_MASS__KG_'] < 6000).
#Select only rows with successful landings on drone ships (df['Landing_Outcome']=='Success (drone ship)')).
df_boost= df.loc[(df['PAYLOAD_MASS__KG_'] > 4000) & (df['PAYLOAD_MASS__KG_'] < 6000) & (df['Landing_Outcome']=='Success (drone ship)')]

set(df_boost['Booster_Version'])  #dataframe contains only successful drone ship landings with payload masses between 4000 and 6000 kilograms.

"""### Task 7

##### List the total number of successful and failure mission outcomes
"""

set(df['Mission_Outcome'])  #The set() function takes the Mission_Outcome column as input and generates a set containing all distinct values found in that column. This set represents the unique mission outcomes present in your data.

df['Mission_Outcome'].value_counts()  # 98 success
#The value_counts() method directly operates on the Mission_Outcome column. It counts the occurrences of each unique value in the column and returns a Series (a one-dimensional Pandas data structure) where the index represents the unique outcomes and the values represent their respective counts.

"""### Task 8

##### List the   names of the booster_versions which have carried the maximum payload mass. Use a subquery
"""

#Determining the Maximum Payload Mass
#The max() function is applied to the PAYLOAD_MASS__KG_ column in the df dataframe. This finds the maximum value (i.e., the heaviest payload mass) among all the payloads in the dataset.
#
max_payloadmass= df['PAYLOAD_MASS__KG_'].max()

max_payloadmass

## Filter data for booster_version with payload mass of 15600 kg.
df_boost_maxmass= df[df['PAYLOAD_MASS__KG_']==max_payloadmass]  #

## Print all booster versions in the filtered data
print(set(df_boost_maxmass['Booster_Version']))

"""### Task 9

##### List the records which will display the month names, failure landing_outcomes in drone ship ,booster versions, launch_site for the months in year 2015.

**Note: SQLLite does not support monthnames. So you need to use  substr(Date, 4, 2) as month to get the months and substr(Date,7,4)='2015' for year.**
"""

import datetime

# Select relevant columns
df_9=df[['Date', 'Landing_Outcome','Booster_Version','Launch_Site']]

df_9

#Extract year and month and day(as string)
df_9['Date'] = pd.to_datetime(df_9['Date'], format='%Y-%m-%d')

#Filter for year 2015 and failed drone ship landings
df_2015=df_9[df_9['Date'].dt.strftime('%Y')=='2015']
df_2015

df_month_name= df_2015[df_2015['Landing_Outcome']=='Failure (drone ship)']  #sprate landing_outcome equal failure (drone ship)

df_month_name

"""### Task 10

##### Rank the  count of  successful landing_outcomes between the date 04-06-2010 and 20-03-2017 in descending order.
"""

#Rank the count of successful landing_outcomes between the date 04-06-2010 and 20-03-2017 in descending order.
#Filter data:
#The code correctly filters the dataframe (df) based on the specified date range (2010-06-04 to 2017-03-20) and isolates rows with the "Success" outcome in the 'Landing_Outcome' column
df_success_outcome= df.loc[(df['Date'] > '2010-06-04') & (df['Date'] < '2017-03-20') & df['Landing_Outcome'].str.contains("Success")]
df_success_outcome

df_success_outcome['Landing_Outcome'].count()  #alue_counts() method to the 'Landing_Outcome' column of the filtered dataframe.

len(df_success_outcome['Landing_Outcome'])