-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtsftask1.py
137 lines (97 loc) · 28.8 KB
/
tsftask1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
# -*- coding: utf-8 -*-
"""TSFTASK1.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1GNRgbMh7bd3ous8paki4DqN0v7fvxFWU
# **Data Science TASK 1**
## **Name:** Ali Nadir
### **GRIP SEPTEMBER 2022**
---
##**Problem statement**:
>Given a dataset of 25 students containing hours studied per day and test scores, ***predict* the tentative score of a student that studied for 9.25 Hrs/Day**
###**Proposed solution:**
>Use ***supervised learning***, since the target values (test scores) are known for every example in the dataset
###**Importing libraries**
"""
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing, svm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
"""###**Reading provided CSV dataset**"""
# Load the student hours/scores table directly from its public GitHub CSV.
df = pd.read_csv(
    filepath_or_buffer="https://raw.githubusercontent.com/AdiPersonalWorks/Random/master/student_scores%20-%20student_scores.csv",
)
# Notebook leftover: previews the first rows. Outside a notebook the returned
# frame is simply discarded.
df.head()
""">Continuous data, so use regression
###**Quick scatter plot of our data**
"""
# Scatter of Scores against Hours with seaborn's fitted regression line on top.
sns.lmplot(data=df, x="Hours", y="Scores")
"""The data is linearly and positively correlated
>Use linear regression
###**Filling any null values (forward fill)**
"""
# Forward-fill gaps in place: every missing cell takes the last valid value
# above it in the same column. DataFrame.ffill() is the direct replacement for
# fillna(method='ffill'), whose `method` argument is deprecated in pandas 2.1+.
# NOTE: a missing value in the very first row has nothing above it to copy and
# therefore stays NaN.
df.ffill(inplace=True)
"""## **Splitting the data and fitting model on it**"""
# scikit-learn estimators expect 2-D inputs of shape (n_samples, n_features);
# selecting with a one-element column list keeps that second axis. The target
# is kept 2-D as well because later code indexes predictions as y_val[0][0].
X = df[["Hours"]].to_numpy()
y = df[["Scores"]].to_numpy()

# Hold out 25% of the rows for evaluation. The seed is fixed so that the score,
# the plots, the 9.25-hour prediction and the error metrics are identical on
# every run; without it each run draws a different random split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

# fit() returns the estimator itself, so construction and training can chain.
reg = LinearRegression().fit(X_train, y_train)

# LinearRegression.score reports the coefficient of determination (R^2) on the
# held-out data.
print(f"Score: {reg.score(X_test, y_test)}")
"""# **Predicting from regression model**
"""
# Predict scores for the held-out hours, then compare visually: red dots are
# the actual test scores, the black line joins the model's predictions.
y_predict = reg.predict(X_test)

plt.title("Scores vs Hours")
plt.scatter(X_test, y_test, color="r")
plt.plot(X_test, y_predict, color="k")
plt.show()
# Ask the fitted model for the score at 9.25 study hours. predict() takes a
# 2-D input and, because the model was fitted on a 2-D target, returns a 2-D
# array, hence the y_val[0][0] indexing below.
y_val = reg.predict([[9.25]])

# Same test-set picture as before, plus the new prediction as a blue point.
plt.xlabel("Hours")
plt.ylabel("Scores")
plt.scatter(X_test, y_test, color="r")
plt.plot(X_test, y_predict, color="k")
plt.scatter([[9.25]], y_val, color="b")
plt.show()

print(f"\n\nThe predicted score for 9.25 hours is {y_val[0][0]}")
# Add the predicted (Hours, Scores) pair as a new last row. DataFrame.append
# was deprecated in pandas 1.4 and removed in 2.0, so the row is wrapped in a
# one-row frame and joined with pd.concat; ignore_index=True renumbers the
# result so the new row receives the next integer index.
df = pd.concat(
    [df, pd.DataFrame({"Hours": [9.25], "Scores": [y_val[0][0]]})],
    ignore_index=True,
)
"""###**Predicted score for 9.25 hours stored in 2D array ```y_val```**
**Now appending to dataframe using...**
```
df=pd.concat([df,pd.DataFrame({"Hours":[9.25],"Scores":[y_val[0][0]]})],ignore_index=True)
```
****
****
##Let us now **sort** the dataframe and **highlight** the predicted score
"""
def highlight(x):
    """Return one CSS string per cell of row ``x``.

    Every cell gets a black background when the row's ``Hours`` value is
    9.25 (the appended prediction); otherwise every cell gets an empty style.
    """
    cell_css = 'background: black' if x.Hours == 9.25 else ''
    return [cell_css for _ in x]


# Order rows by ascending score in place, then style the five highest-scoring
# rows. Outside a notebook the returned Styler is discarded.
df.sort_values(by=['Scores'], inplace=True)
df.tail().style.apply(highlight, axis=1)
"""* **Appended row has ```index=25```**
* **Row highlighted in black**
> ![image.png]()
>> **using *```lambda```* function**
* **The predicted score coincides with the plot for the *test* data as well, which is a plus in accuracy!**
>![image.png]()
>>Here, the blue point denotes the predicted score for ```9.25``` hours, which was ```93.65629821373471``` in the recorded run (the exact value depends on the train/test split)
"""
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Error metrics comparing the held-out true scores with the model's predictions.
mae = mean_absolute_error(y_true=y_test, y_pred=y_predict)
mse = mean_squared_error(y_true=y_test, y_pred=y_predict)
# RMSE is the square root of MSE. It is computed directly rather than through
# mean_squared_error(..., squared=False): that keyword was deprecated in
# scikit-learn 1.4 and removed in 1.6, where passing it raises a TypeError.
rmse = np.sqrt(mse)

print("MAE:", mae)
print("MSE:", mse)
print("RMSE:", rmse)
"""# **Model Accuracy**
>### Numerical indicator of how close the predicted values are to the true values
>### Evaluated using ***loss*** functions
>>* **M**ean **S**quared **E**rror
>>>```53.51594636921685```
>>* **M**ean **A**bsolute **E**rror
>>>```6.597923404255318```
>>* **R**oot **M**ean **S**quared **E**rror
>>>```7.315459409306899```
>
>## ***Lower is better***
"""