import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statistics


cars = pd.read_csv("Car details v3.csv")
cars.dropna(inplace=True)
cars.head()

/var/folders/gc/0752xrm56pnf0r0dsrn5370c0000gr/T/ipykernel_606/3782334901.py:2: DeprecationWarning: 
Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd

cars.dropna(inplace=True)
cars.drop_duplicates(inplace=True)
cars.drop(['year', 'fuel', 'seller_type', 'transmission', 'torque', 'seats','max_power'], axis=1, inplace=True)
cars= cars[cars['owner'] != 'Test Drive Car']

cars.head()

def is_outlier(x):
    Q25, Q75 = x.quantile([.25,.75])
    I = Q75 - Q25
    return (x < Q25 - 1.5*I) |  (x > Q75 + 1.5*I)

def IQR_outliers(x):
    IQR = x.quantile(.75) - x.quantile(.25)
    return IQR

outliers_selling_price = cars['selling_price'].loc[is_outlier(cars['selling_price'])]
IQR_selling_price = IQR_outliers(cars['selling_price'])
print("There are " + str(outliers_selling_price.count()) + " outliers in the SELLING PRICE column with an IQR of " + str(IQR_selling_price) + " given that the 25 percentile and 75 percentile are " + str(cars['selling_price'].quantile(.25)) + " and " + str(cars['selling_price'].quantile(.75)) + " respectively.")


outliers_selling_price = cars['km_driven'].loc[is_outlier(cars['km_driven'])]
IQR_selling_price = IQR_outliers(cars['km_driven'])
print("There are " + str(outliers_selling_price.count()) + " outliers in the KILOMETERS DRIVEN   column with an IQR of " + str(IQR_selling_price) + " given that the 25 percentile and 75 percentile are " + str(cars['km_driven'].quantile(.25)) + " and " + str(cars['km_driven'].quantile(.75)) + " respectively.")

cars['mileage_numeric'] = cars['mileage'].str.extract('(\d+.\d+)').astype(float)
outliers_mileage = cars['mileage_numeric'].loc[is_outlier(cars['mileage_numeric'])]
IQR_mileage = IQR_outliers(cars['mileage_numeric'])
print("There are " + str(outliers_mileage.count()) + " outliers in the MILEAGE column with an IQR of " + str(IQR_mileage) + " given that the 25 percentile and 75 percentile are " + str(cars['mileage_numeric'].quantile(.25)) + " and " + str(cars['mileage_numeric'].quantile(.75)) + " respectively.")

cars['engine_numeric'] = cars['engine'].str.extract('(\d+)').astype(int)
outliers_engine = cars['engine_numeric'].loc[is_outlier(cars['engine_numeric'])]
IQR_engine = IQR_outliers(cars['engine_numeric'])
print("There are " + str(outliers_engine.count()) + " outliers in the ENGINE column with an IQR of " + str(IQR_engine) + " given that the 25 percentile and 75 percentile are " + str(cars['engine_numeric'].quantile(.25)) + " and " + str(cars['engine_numeric'].quantile(.75)) + " respectively.")

# Remove rows with outliers in the selling_price column
cars = cars.loc[~cars['selling_price'].isin(outliers_selling_price)]

# Remove rows with outliers in the km_driven column
cars = cars.loc[~cars['km_driven'].isin(outliers_selling_price)]

# Remove rows with outliers in the mileage_numeric column
cars = cars.loc[~cars['mileage_numeric'].isin(outliers_mileage)]

There are 301 outliers in the SELLING PRICE column with an IQR of 400000.0 given that the 25 percentile and 75 percentile are 250000.0 and 650000.0 respectively.
There are 161 outliers in the KILOMETERS DRIVEN   column with an IQR of 62000.0 given that the 25 percentile and 75 percentile are 38000.0 and 100000.0 respectively.
There are 21 outliers in the MILEAGE column with an IQR of 5.709999999999997 given that the 25 percentile and 75 percentile are 16.8 and 22.509999999999998 respectively.
There are 1207 outliers in the ENGINE column with an IQR of 301.0 given that the 25 percentile and 75 percentile are 1197.0 and 1498.0 respectively.

# Extract the first word (car brand) from the name column
cars['car_brand'] = cars['name'].str.split().str[0]
# Get the most common car brand
most_common_brand = cars['car_brand'].value_counts().idxmax()
# Get the most uncommon car brand
most_uncommon_brand = cars['car_brand'].value_counts().idxmin()
print("Most common car brand:", most_common_brand)
print("Most uncommon car brand:", most_uncommon_brand)
average_cars_per_brand = car_brand_counts.mean()
print("Average number of cars per brand:", average_cars_per_brand)
car_brand_counts = cars['car_brand'].value_counts()
print(car_brand_counts)

Most common car brand: Maruti
Most uncommon car brand: Lexus

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[4], line 9
      7 print("Most common car brand:", most_common_brand)
      8 print("Most uncommon car brand:", most_uncommon_brand)
----> 9 average_cars_per_brand = car_brand_counts.mean()
     10 print("Average number of cars per brand:", average_cars_per_brand)
     11 car_brand_counts = cars['car_brand'].value_counts()

NameError: name 'car_brand_counts' is not defined

summary_stats = cars['selling_price'].describe()
print(summary_stats)

plt.figure(figsize=(7,7))
plt.boxplot(cars['selling_price'], patch_artist=True)
plt.title('Summary Statistics for Selling Price')
plt.xlabel('Selling Price')
plt.ylabel('Values')
plt.grid(True)
plt.show()

count    5.083000e+03
mean     5.844928e+05
std      5.457134e+05
min      2.999900e+04
25%      3.100000e+05
50%      5.030000e+05
75%      7.000000e+05
max      7.200000e+06
Name: selling_price, dtype: float64

summary_stats_km_driven = cars['km_driven'].describe()
print(summary_stats_km_driven)

plt.figure(figsize=(7,7))
plt.boxplot(cars['km_driven'], patch_artist=True)
plt.title('Summary Statistics for Kilometers Driven')
plt.xlabel('Kilometers Driven')
plt.ylabel('Values')
plt.grid(True)
plt.show()

count      5083.000000
mean      66775.135943
std       39678.236529
min           1.000000
25%       35000.000000
50%       60000.000000
75%       93000.000000
max      192000.000000
Name: km_driven, dtype: float64

owner_counts = cars['owner'].value_counts()
print(owner_counts)

owner
First Owner             3281
Second Owner            1339
Third Owner              348
Fourth & Above Owner     115
Name: count, dtype: int64

average_owner = (1*4176 + 2*1888 + 3*493 + 4*155)/(4176 + 1888 + 493 + 155)
print("The Average owner number for the cars in the dataset is :", average_owner)
owner_numbers = [4176, 1888, 493, 155]
mode_owner = statistics.mode(owner_numbers)
print("The mode owner number for the cars in the dataset is :", mode_owner)

x_labels = ['First Owner', 'Second Owner', 'Third Owner', 'Fourth & Above Owner']
y_values = owner_numbers

plt.bar(x_labels, y_values)
plt.xlabel('Owner Categories')
plt.ylabel('Number of Cars')
plt.title('Number of Cars by Owner Category')
plt.show()

The Average owner number for the cars in the dataset is : 1.4974672228843862
The mode owner number for the cars in the dataset is : 4176

summary_stats_km_driven = cars['mileage_numeric'].describe()
print(summary_stats_km_driven)

plt.figure(figsize=(7,7))
plt.boxplot(cars['mileage_numeric'], patch_artist=True)
plt.title('Summary Statistics for Mileage')
plt.xlabel('Mileage')
plt.ylabel('Values')
plt.grid(True)
plt.show()

count    5083.000000
mean       19.558013
std         3.989036
min         9.000000
25%        16.800000
50%        19.400000
75%        22.540000
max        30.460000
Name: mileage_numeric, dtype: float64

summary_stats_km_driven = cars['engine_numeric'].describe()
print(summary_stats_km_driven)

plt.figure(figsize=(7,7))
plt.boxplot(cars['engine_numeric'], patch_artist=True)
plt.title('Summary Statistics for Engine Capacity')
plt.xlabel('Engine Capacity')
plt.ylabel('Values')
plt.grid(True)
plt.show()

count    5083.000000
mean     1442.973638
std       489.337612
min       624.000000
25%      1197.000000
50%      1248.000000
75%      1498.000000
max      3604.000000
Name: engine_numeric, dtype: float64

correlation_km_driven = cars['selling_price'].corr(cars['km_driven'])
correlation_mileage = cars['selling_price'].corr(cars['mileage_numeric'])
correlation_engine = cars['selling_price'].corr(cars['engine_numeric'])

# Scatter plot with line of best fit for correlation_km_driven
sns.lmplot(x='km_driven', y='selling_price', data=cars)
plt.title('Correlation between selling_price and Kilometers Driven')
plt.xlabel('Kilometers Driven')
plt.ylabel('Selling Price')
plt.show()
print("Correlation between selling_price and Kilometers Driven:", correlation_km_driven)

# Scatter plot with line of best fit for correlation_mileage
sns.lmplot(x='mileage_numeric', y='selling_price', data=cars)
plt.title('Correlation between selling_price and Mileage')
plt.xlabel('Mileage')
plt.ylabel('Selling Price')
plt.show()
print("Correlation between selling_price and Mileage:", correlation_mileage)


# Scatter plot with line of best fit for correlation_engine
sns.lmplot(x='engine_numeric', y='selling_price', data=cars)
plt.title('Correlation between selling_price and Engine Capacity')
plt.xlabel('Engine Capacity')
plt.ylabel('Selling Price')
plt.show()


print("Correlation between selling_price and Engine Capacity:", correlation_engine)

Correlation between selling_price and Kilometers Driven: -0.2284780764094624

Correlation between selling_price and Mileage: -0.13964557945780787

Correlation between selling_price and Engine Capacity: 0.5057409656831157

	name	year	selling_price	km_driven	fuel	seller_type	transmission	owner	mileage	engine	max_power	torque	seats
0	Maruti Swift Dzire VDI	2014	450000	145500	Diesel	Individual	Manual	First Owner	23.4 kmpl	1248 CC	74 bhp	190Nm@ 2000rpm	5.0
1	Skoda Rapid 1.5 TDI Ambition	2014	370000	120000	Diesel	Individual	Manual	Second Owner	21.14 kmpl	1498 CC	103.52 bhp	250Nm@ 1500-2500rpm	5.0
2	Honda City 2017-2020 EXi	2006	158000	140000	Petrol	Individual	Manual	Third Owner	17.7 kmpl	1497 CC	78 bhp	12.7@ 2,700(kgm@ rpm)	5.0
3	Hyundai i20 Sportz Diesel	2010	225000	127000	Diesel	Individual	Manual	First Owner	23.0 kmpl	1396 CC	90 bhp	22.4 kgm at 1750-2750rpm	5.0
4	Maruti Swift VXI BSIII	2007	130000	120000	Petrol	Individual	Manual	First Owner	16.1 kmpl	1298 CC	88.2 bhp	11.5@ 4,500(kgm@ rpm)	5.0

	name	selling_price	km_driven	owner	mileage	engine
0	Maruti Swift Dzire VDI	450000	145500	First Owner	23.4 kmpl	1248 CC
1	Skoda Rapid 1.5 TDI Ambition	370000	120000	Second Owner	21.14 kmpl	1498 CC
2	Honda City 2017-2020 EXi	158000	140000	Third Owner	17.7 kmpl	1497 CC
3	Hyundai i20 Sportz Diesel	225000	127000	First Owner	23.0 kmpl	1396 CC
4	Maruti Swift VXI BSIII	130000	120000	First Owner	16.1 kmpl	1298 CC

Project 1¶

1. Introduction¶

2. Preproccessing¶

3. Summary Data Analysis¶

Name column¶

Selling Price Column¶

Kilometers Driven Column¶

Owner Column¶

Mileage Column¶

Engine Column¶

4. Discussion¶