-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathspacex__sql_week2ipynb.py
223 lines (141 loc) · 9.74 KB
/
spacex__sql_week2ipynb.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
# -*- coding: utf-8 -*-
"""SpaceX._SQL_week2ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1njUH4yaaSvTAQKCiMMKh4nEvJ2gZfrmd
# Connect to the database
Let us first load the SQL extension and establish a connection with the database
"""
import pandas as pd
df = pd.read_csv("https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DS0321EN-SkillsNetwork/labs/module_2/data/Spacex.csv")
#df.to_sql("SPACEXTBL", con, if_exists='replace', index=False,method="multi")
df.info()
df.head(3)
"""## Tasks
Now write and execute SQL queries to solve the assignment tasks.
**Note: If the column names are in mixed case enclose it in double quotes
For Example "Landing_Outcome"**
### Task 1
##### Display the names of the unique launch sites in the space mission
"""
set(df['Launch_Site']) #Utilizing the set function to extract unique values from a column containing launch site names. The set function eliminates duplicates and presents the remaining distinct values as the output.
"""### Task 2
##### Display 5 records where launch sites begin with the string 'CCA'
In data science, the rows of a table or dataset are called records, and the columns are called fields or columns. These terms are typically used to describe the structure of tabular data, such as databases, spreadsheets, and CSV files.
"""
df_CCA01= df[df['Launch_Site']=='CCAFS LC-40']
#df[df['Launch_Site'].str.startswith('CCAFS')]: This part filters the DataFrame df to only include rows where the value in the Launch_Site column starts with the string "CCAFS".
"""The query method allows you to filter the DataFrame based on a boolean expression. Here, Launch_Site.str.contains('CCA') checks for the presence of "CCA" in the Launch_Site column.
This method iterates through each row of the DataFrame using a list comprehension. It checks if "CCA" exists within the Launch_Site column and adds matching rows to a new list. You can optionally convert the list back to a DataFrame using its columns.
These methods achieve the same result as the original code, but they provide different approaches for filtering the DataFrame. Choose the method that best suits your coding style and readability preferences.
df_CCA= df[df['Launch_Site'].str.contains('CCA')]
"""
df_CCA.head()
df_CCA= df[df['Launch_Site'].str.contains('CCA')]
df_CCA
set(df_CCA['Launch_Site']) #"The records identified by the filtering criteria all belong to the same launch site."
df_CCA.head()
"""### Task 3
##### Display the total payload mass carried by boosters launched by NASA (CRS)
Task 3 asks to show me the payload mass and booster versions.
"""
df.info()
df['Customer'] == 'NASA (CRS)'
#The query is asking for a way to display the PAYLOAD_MASS__KG_ and Booster_Version columns from the df DataFrame. The provided code snippet achieves this by:
#Creating a new DataFrame: A new DataFrame named df_Mass is created by selecting only the PAYLOAD_MASS__KG_ and Booster_Version columns from the original df DataFrame. This extracts the specific columns of interest.
#Displaying the new DataFrame: The print() statement displays the contents of the df_Mass DataFrame. This presents the extracted information in a tabular format.
df_Mass=df[['PAYLOAD_MASS__KG_', 'Booster_Version', 'Customer']]
df_Mass
NASA_CRS = df[df['Customer'] == 'NASA (CRS)']
Total_PayloadMass = NASA_CRS['PAYLOAD_MASS__KG_'].sum()
Total_PayloadMass
"""### Task 4
##### Display average payload mass carried by booster version F9 v1.1
"""
#Use the set function to find the distinct set of values in the Booster_Version column.
#This will print a set of strings, each representing a unique booster version.
print(set(df_Mass['Booster_Version']))
#Output Format Differences: set vs. print
#set: Returns a collection of values as a data structure, which can include lists, dictionaries, or other types. This data structure can be used for further processing within the program.
#print: Displays values as a simple string to the output (e.g., console or file). This string can include text, numbers, or a combination of both
#Filter the df_Mass dataframe for data pertaining to booster version F9 v1.1.
# Filter data for booster version F9 v1.1
df_F9= df_Mass[df_Mass['Booster_Version'].str.contains("F9 v1.1")]
# Calculate average payload mass
# Use the mean() function to calculate the average payload mass.
df_F9['PAYLOAD_MASS__KG_'].mean()
#This code displays the average payload mass carried by the F9 v1.1 booster version.
#This helps you understand how much payload this booster version can carry on average.
df_F9['PAYLOAD_MASS__KG_'].sum()
df['PAYLOAD_MASS__KG_'].sum()
df_F9 #Show all booster_version and payload_mass_kg that start with the phrase F9 V1.1.
"""### Task 5
##### List the date when the first succesful landing outcome in ground pad was acheived.
"""
#Task 5 says to tell the date of the first successful LANDING OUTCOME.
df_Date=df[['Date', 'Landing_Outcome']]
df_Date
df_date= df_Date[df_Date['Landing_Outcome'] == 'Success (ground pad)']
#df_date= df_Date[df_Date['Landing_Outcome']=='Success']: This line filters the df_Date DataFrame to include only rows where the value in the 'Landing_Outcome' column is equal to 'Success'. The result is stored in a new DataFrame named df_date.
df_date
#The minimum date among successful landings in your data can be found using the min() method after filtering for successful outcomes
#Find minimum date: The min() method is applied to the 'Date' column of df_successful_landings to find the earliest date among successful landings
min_data = df_Date['Date'].min()
print(f"Date when the first successful landing outcome in ground pad was achieved: {min_data}")
df_success_land = df_Date[df_Date['Landing_Outcome']=='Success'] #"An alternative approach suggested by the project itself"
df_success_land["Date"] = df_success_land["Date"].astype('datetime64[ns]') #"An alternative approach suggested by the project itself"
df_date = df_success_land.sort_values(by='Date') #"An alternative approach suggested by the project itself"
df_date['Date'].min() #"An alternative approach suggested by the project itself"
"""### Task 6
##### List the names of the boosters which have success in drone ship and have payload mass greater than 4000 but less than 6000
"""
set(df['Landing_Outcome'])
df_booster = df[['Landing_Outcome','Booster_Version', 'PAYLOAD_MASS__KG_']] #Filter the data to only show successful drone ship landings
df_booster
#Select only rows with payload mass between 4000 and 6000 kilograms (df['PAYLOAD_MASS__KG_'] > 4000 and df['PAYLOAD_MASS__KG_'] < 6000).
#Select only rows with successful landings on drone ships (df['Landing_Outcome']=='Success (drone ship)')).
df_boost= df.loc[(df['PAYLOAD_MASS__KG_'] > 4000) & (df['PAYLOAD_MASS__KG_'] < 6000) & (df['Landing_Outcome']=='Success (drone ship)')]
set(df_boost['Booster_Version']) #dataframe contains only successful drone ship landings with payload masses between 4000 and 6000 kilograms.
"""### Task 7
##### List the total number of successful and failure mission outcomes
"""
set(df['Mission_Outcome']) #The set() function takes the Mission_Outcome column as input and generates a set containing all distinct values found in that column. This set represents the unique mission outcomes present in your data.
df['Mission_Outcome'].value_counts() # 98 success
#The value_counts() method directly operates on the Mission_Outcome column. It counts the occurrences of each unique value in the column and returns a Series (a one-dimensional Pandas data structure) where the index represents the unique outcomes and the values represent their respective counts.
"""### Task 8
##### List the names of the booster_versions which have carried the maximum payload mass. Use a subquery
"""
#Determining the Maximum Payload Mass
#The max() function is applied to the PAYLOAD_MASS__KG_ column in the df dataframe. This finds the maximum value (i.e., the heaviest payload mass) among all the payloads in the dataset.
#
max_payloadmass= df['PAYLOAD_MASS__KG_'].max()
max_payloadmass
## Filter data for booster_version with payload mass of 15600 kg.
df_boost_maxmass= df[df['PAYLOAD_MASS__KG_']==max_payloadmass] #
## Print all booster versions in the filtered data
print(set(df_boost_maxmass['Booster_Version']))
"""### Task 9
##### List the records which will display the month names, failure landing_outcomes in drone ship ,booster versions, launch_site for the months in year 2015.
**Note: SQLLite does not support monthnames. So you need to use substr(Date, 4, 2) as month to get the months and substr(Date,7,4)='2015' for year.**
"""
import datetime
# Select relevant columns
df_9=df[['Date', 'Landing_Outcome','Booster_Version','Launch_Site']]
df_9
#Extract year and month and day(as string)
df_9['Date'] = pd.to_datetime(df_9['Date'], format='%Y-%m-%d')
#Filter for year 2015 and failed drone ship landings
df_2015=df_9[df_9['Date'].dt.strftime('%Y')=='2015']
df_2015
df_month_name= df_2015[df_2015['Landing_Outcome']=='Failure (drone ship)'] #sprate landing_outcome equal failure (drone ship)
df_month_name
"""### Task 10
##### Rank the count of successful landing_outcomes between the date 04-06-2010 and 20-03-2017 in descending order.
"""
#Rank the count of successful landing_outcomes between the date 04-06-2010 and 20-03-2017 in descending order.
#Filter data:
#The code correctly filters the dataframe (df) based on the specified date range (2010-06-04 to 2017-03-20) and isolates rows with the "Success" outcome in the 'Landing_Outcome' column
df_success_outcome= df.loc[(df['Date'] > '2010-06-04') & (df['Date'] < '2017-03-20') & df['Landing_Outcome'].str.contains("Success")]
df_success_outcome
df_success_outcome['Landing_Outcome'].count() #alue_counts() method to the 'Landing_Outcome' column of the filtered dataframe.
len(df_success_outcome['Landing_Outcome'])