ft_linear_regression

This commit is contained in:
gbrochar 2020-11-22 18:05:40 +01:00
parent d5134e3903
commit 6898b2c39b
6 changed files with 299 additions and 0 deletions

83
LinearRegression.py Normal file
View File

@ -0,0 +1,83 @@
import numpy as np
import matplotlib.pyplot as plt
class LinearRegression:
    """Linear regression trained with batch gradient descent.

    ``data`` is a 2-D array: the last column is the target, every other
    column is a feature. Features are min-max normalised internally.

    Attributes:
        raw_data: the data exactly as passed in.
        data: design matrix, a leading column of ones followed by the
            normalised feature columns.
        data_y: the target column of ``raw_data``.
        thetas: coefficients in normalised space (index 0 is the intercept).
        raw_thetas: coefficients expressed in the units of ``raw_data``.
        cost: mean squared error recorded after every training epoch.
    """

    def __init__(self, thetas, data, epochs=1000, learning_rate=0.001):
        """Store the hyper-parameters and normalise the data and the thetas.

        Args:
            thetas: initial coefficients. Entry ``i + 1`` is read as the raw
                slope of feature column ``i`` and the LAST entry is read as
                the normalised intercept. If it cannot be read that way,
                both ``thetas`` and ``raw_thetas`` are reset to zeros.
            data: 2-D array, features first and target in the last column.
            epochs: number of gradient descent iterations.
            learning_rate: step size applied to the averaged gradient.
        """
        self.cost = []
        self.epochs = epochs
        self.learning_rate = learning_rate
        self.raw_thetas = thetas
        self.raw_data = data
        self.__get_scaled_data()
        try:
            self.__get_scaled_thetas()
        except (TypeError, IndexError, ValueError):
            # Wrong type, too few entries or non numeric content.
            print('error in raw_thetas format, setting thetas to 0')
            self.thetas = np.zeros(self.raw_data.shape[1])
            self.raw_thetas = np.zeros(self.raw_data.shape[1])

    def gradient_descent(self):
        """Train for ``self.epochs`` epochs, then refresh ``raw_thetas``.

        The cost after each epoch is appended to ``self.cost``.
        """
        for _ in range(self.epochs):
            self.thetas = self.__gradient_descent_epoch()
            self.cost.append(self.get_cost())

        features = self.raw_data[:, :-1]
        raw_thetas = np.empty(len(self.thetas))
        # A slope learnt on a feature divided by its range must itself be
        # divided by that range to apply to the raw feature.
        raw_thetas[1:] = self.thetas[1:] / self.__feature_ranges()
        # The intercept is chosen so the fitted hyperplane passes through
        # the mean point of the raw data. This matches the exact
        # de-normalisation only once gradient descent has converged.
        raw_thetas[0] = (np.mean(self.raw_data[:, -1])
                         - np.dot(raw_thetas[1:], features.mean(axis=0)))
        self.raw_thetas = raw_thetas

    def get_cost(self):
        """Return the mean squared error of the current thetas.

        Every row takes part in the sum, including the first one.
        """
        residuals = self.data.dot(self.thetas) - self.data_y
        return np.sum(residuals ** 2) / float(self.data.shape[0])

    def show(self):
        """Plot the data with the fitted line, and the cost per epoch.

        The left panel uses only columns 0 and 1 of ``raw_data`` and the
        first two ``raw_thetas``.
        """
        x_max = max(self.raw_data[:, 0])
        plt.subplot(1, 2, 1)
        plt.plot(self.raw_data[:, 0], self.raw_data[:, 1], 'r.')
        plt.plot(
            [0, x_max],
            [self.raw_thetas[0], self.raw_thetas[0] + self.raw_thetas[1] * x_max],
        )
        plt.ylabel('y')
        plt.xlabel('x')
        plt.subplot(1, 2, 2)
        plt.plot(self.cost)
        plt.ylabel('cost')
        plt.xlabel('epochs')
        plt.tight_layout()
        plt.show()

    def __feature_ranges(self):
        """Return ``max - min`` of every feature column.

        A constant column has a range of 0; it is reported as 1 so that the
        divisions using this value never produce NaN or infinity.
        """
        features = self.raw_data[:, :-1]
        ranges = (features.max(axis=0) - features.min(axis=0)).astype(float)
        ranges[ranges == 0] = 1.0
        return ranges

    def __get_scaled_data(self):
        """Build ``data`` and ``data_y`` from ``raw_data``.

        Column 0 of ``data`` is filled with 1 (so theta0 * x0 == theta0);
        the other columns hold the min-max normalised features.
        """
        features = self.raw_data[:, :-1]
        self.data = np.ones(self.raw_data.shape)
        self.data[:, 1:] = ((features - features.min(axis=0))
                            / self.__feature_ranges())
        self.data_y = self.raw_data[:, -1]

    def __get_scaled_thetas(self):
        """Convert ``raw_thetas`` into normalised-space ``thetas``.

        The last entry of ``raw_thetas`` is taken as the intercept and entry
        ``i + 1`` is multiplied by the range of feature column ``i``.

        Raises:
            TypeError, IndexError, ValueError: ``raw_thetas`` does not have
                the expected layout.
        """
        ranges = self.__feature_ranges()
        self.thetas = np.empty(self.raw_data.shape[1])
        self.thetas[0] = self.raw_thetas[len(self.raw_thetas) - 1]
        # Explicit indexing keeps the IndexError on a too-short raw_thetas;
        # a sliced product could silently broadcast instead.
        for column, feature_range in enumerate(ranges):
            self.thetas[column + 1] = self.raw_thetas[column + 1] * feature_range

    def __gradient_descent_epoch(self):
        """Return the thetas after one batch gradient descent step."""
        residuals = self.data.dot(self.thetas) - self.data_y
        gradient = self.data.T.dot(residuals) / float(len(self.data))
        return self.thetas - self.learning_rate * gradient

    def __predict(self, row):
        """Return the model output for row index ``row`` of ``data``."""
        return np.dot(self.data[row], self.thetas)

25
data.csv Normal file
View File

@ -0,0 +1,25 @@
km,price
240000,3650
139800,3800
150500,4400
185530,4450
176000,5250
114800,5350
166800,5800
89000,5990
144500,5999
84000,6200
82029,6390
63060,6390
74000,6600
97500,6800
67000,6800
76025,6900
48235,6900
93000,6990
60949,7490
65674,7555
54000,7990
68500,7990
22899,7990
61789,8290
1 km price
2 240000 3650
3 139800 3800
4 150500 4400
5 185530 4450
6 176000 5250
7 114800 5350
8 166800 5800
9 89000 5990
10 144500 5999
11 84000 6200
12 82029 6390
13 63060 6390
14 74000 6600
15 97500 6800
16 67000 6800
17 76025 6900
18 48235 6900
19 93000 6990
20 60949 7490
21 65674 7555
22 54000 7990
23 68500 7990
24 22899 7990
25 61789 8290

25
data_offset.csv Normal file
View File

@ -0,0 +1,25 @@
km,price
240000,8650
139800,8800
150500,9400
185530,9450
176000,10250
114800,10350
166800,10800
89000,10990
144500,10999
84000,11200
82029,11390
63060,11390
74000,11600
97500,11800
67000,11800
76025,11900
48235,11900
93000,11990
60949,12490
65674,12555
54000,12990
68500,12990
22899,12990
61789,13290
1 km price
2 240000 8650
3 139800 8800
4 150500 9400
5 185530 9450
6 176000 10250
7 114800 10350
8 166800 10800
9 89000 10990
10 144500 10999
11 84000 11200
12 82029 11390
13 63060 11390
14 74000 11600
15 97500 11800
16 67000 11800
17 76025 11900
18 48235 11900
19 93000 11990
20 60949 12490
21 65674 12555
22 54000 12990
23 68500 12990
24 22899 12990
25 61789 13290

25
multi_data.csv Normal file
View File

@ -0,0 +1,25 @@
age,km,price
10,240000,8050
15,139800,8200
18,150500,7400
2,185530,9250
6,176000,9950
8,114800,9950
10,166800,9800
25,89000,7990
4,144500,10499
1,84000,11000
0,82029,12000
2,63060,11300
7,74000,11000
4,97500,11000
9,67000,10000
4,76025,11200
4,48235,11300
7,93000,10500
1,60949,12300
6,65674,11800
5,54000,12500
4,68500,12600
2,22899,12800
7,61789,12590
1 age km price
2 10 240000 8050
3 15 139800 8200
4 18 150500 7400
5 2 185530 9250
6 6 176000 9950
7 8 114800 9950
8 10 166800 9800
9 25 89000 7990
10 4 144500 10499
11 1 84000 11000
12 0 82029 12000
13 2 63060 11300
14 7 74000 11000
15 4 97500 11000
16 9 67000 10000
17 4 76025 11200
18 4 48235 11300
19 7 93000 10500
20 1 60949 12300
21 6 65674 11800
22 5 54000 12500
23 4 68500 12600
24 2 22899 12800
25 7 61789 12590

86
predict.py Normal file
View File

@ -0,0 +1,86 @@
import argparse
import numpy as np
from train import train
from LinearRegression import LinearRegression
from matplotlib import animation
def get_thetas(path):
    """Read the first two coefficients stored in a thetas file.

    Args:
        path: path of a comma-delimited file readable by ``np.genfromtxt``.

    Returns:
        ``[thetas[0], thetas[1]]`` on success, or the string ``"error"``
        when the file cannot be read or holds fewer than two values.
    """
    try:
        thetas = np.genfromtxt(path, delimiter=',')
    except (OSError, ValueError):
        # OSError: missing or unreadable file. ValueError: malformed rows.
        print('info: path not found, asking for train')
        return "error"
    try:
        return [thetas[0], thetas[1]]
    except IndexError:
        # Empty file, or a single value (genfromtxt then yields a 0-d array).
        print('warning: path in wrong format, asking for train')
        return "error"
def get_price(mileage, thetas):
    """Return the price given by the linear model for ``mileage``.

    ``thetas[0]`` is the intercept and ``thetas[1]`` the slope; any further
    entries of ``thetas`` are ignored.
    """
    intercept = thetas[0]
    slope = thetas[1]
    return intercept + mileage * slope
def predict_subject():
    """Interactively predict a car price from a mileage.

    Coefficients are read from ``thetas.csv``. When they are unavailable the
    user may train the model on ``data.csv``; otherwise both coefficients
    are set to 0. The mileage is then read from standard input and the
    predicted price is printed.
    """
    thetas = get_thetas('thetas.csv')
    # get_thetas reports failure by returning the string "error".
    if isinstance(thetas, str):
        print("")
        try:
            input_thetas = input("model isn't trained, would you like to train it before predicting car price ? y/n\n")
            if input_thetas == 'y':
                raw_data = np.genfromtxt('data.csv', delimiter=',', skip_header=1)
                train(raw_data, np.zeros(2), 'thetas.csv', False)
                thetas = np.genfromtxt('thetas.csv', delimiter=',')
            else:
                print('info: wrong input format, setting thetas to 0')
                thetas = [0, 0]
        except Exception:
            # Best effort: any failure to read the answer or to train falls
            # back to an untrained model. Ctrl-C is no longer swallowed.
            print('info: wrong input format or fail to train, setting thetas to 0')
            thetas = [0, 0]
    try:
        # NOTE: the previous code ran eval() on the typed text, which
        # executes arbitrary Python. Only plain numbers are accepted now;
        # arithmetic expressions are rejected.
        mileage = int(float(input("Enter mileage\n")))
    except (ValueError, OverflowError, EOFError):
        print('info: input a number')
        return
    if mileage < 0:
        print('info: mileage should be superior to 0 ! aborting')
        return
    price = get_price(mileage, thetas)
    if price < 0:
        print('This car belongs in a museum ! (price inferior to 0)')
    else:
        print('Predicted car value is ', price)
def get_y(x, thetas):
    """Return the sum of ``x[i] * thetas[i]`` over every index of ``x``.

    ``thetas`` may be longer than ``x``; the extra entries are ignored.
    """
    return sum(value * thetas[index] for index, value in enumerate(x))
def predict(thetas_path):
    """Interactively predict ``y`` from the coefficients in ``thetas_path``.

    The file is expected to hold one more value than there are coefficients
    in use: ``x`` gets ``len(thetas) - 1`` entries, ``x[0]`` is fixed to 1
    and the remaining entries are read from standard input.

    Args:
        thetas_path: path of a comma-delimited thetas file.
    """
    try:
        # atleast_1d: a single-value file yields a 0-d array, which has no len().
        thetas = np.atleast_1d(np.genfromtxt(thetas_path, delimiter=','))
    except (OSError, ValueError):
        print('wrong name or format')
        return
    if len(thetas) < 2:
        # Not even room for the constant term x[0].
        print('wrong name or format')
        return
    x = np.empty(len(thetas) - 1)
    x[0] = 1
    for i in range(1, len(thetas) - 1):
        try:
            s = "Enter feature " + str(i) + "\n"
            # NOTE: the previous code ran eval() on the typed text, which
            # executes arbitrary Python. Only plain numbers are accepted now.
            x[i] = int(float(input(s)))
        except (ValueError, OverflowError, EOFError):
            print('input a number')
            return
    print("y is equal to ", get_y(x, thetas))
def main():
    """Parse the command line and run the matching prediction mode.

    Without ``-p/--path`` the single-feature car price flow is used; with it
    the generic prediction reads the given thetas file.
    """
    cli = argparse.ArgumentParser(description='DSLR is a 2 day project, if you do Linear Regression in two weeks :pepethefrog:')
    cli.add_argument("-p", "--path", type=str, default=False, help="thetas file path")
    options = cli.parse_args()
    # The default is the False object; any value given on the command line
    # is a string.
    if options.path is not False:
        predict(options.path)
        return
    predict_subject()


if __name__ == '__main__':
    main()

55
train.py Normal file
View File

@ -0,0 +1,55 @@
import argparse
import numpy as np
from LinearRegression import LinearRegression
from matplotlib import animation
def get_thetas():
    """Read two coefficients from ``thetas.csv`` in the working directory.

    Returns:
        ``[thetas[2], thetas[1]]`` when the file holds at least three
        values, ``[0, 0]`` when it is missing, unreadable or too short.
    """
    try:
        thetas = np.genfromtxt('thetas.csv', delimiter=',')
    except (OSError, ValueError):
        # OSError: missing or unreadable file. ValueError: malformed rows.
        print("info: thetas.csv not found, setting thetas to 0")
        return [0, 0]
    try:
        return [thetas[2], thetas[1]]
    except IndexError:
        print("warning: thetas.csv in wrong format, setting thetas to 0")
        return [0, 0]
def train(raw_data, raw_thetas, thetas_path, visu):
    """Fit a LinearRegression on ``raw_data`` and save its coefficients.

    The file written to ``thetas_path`` holds the trained raw thetas
    followed by the normalised intercept ``thetas[0]``.

    Args:
        raw_data: training data handed to LinearRegression.
        raw_thetas: starting coefficients handed to LinearRegression.
        thetas_path: destination file of the trained coefficients.
        visu: when true and the model has exactly two thetas, plot it.
    """
    model = LinearRegression(thetas=raw_thetas, data=raw_data, epochs=10000, learning_rate=0.1)
    print(model.raw_data)
    print("thetas before train")
    # Everything except the last entry.
    print(model.raw_thetas[:-1])

    model.gradient_descent()
    print("thetas after train")
    print(model.raw_thetas)

    coefficients = list(model.raw_thetas)
    coefficients.append(model.thetas[0])
    np.savetxt(thetas_path, coefficients, delimiter=',')

    if visu and len(model.thetas) == 2:
        model.show()
def main():
    """Parse the command line, load the data and thetas, and train.

    Options:
        -p/--path: data file (default ``data.csv``); its first line is
            skipped as a header.
        -t/--thetas: thetas file, read for the starting coefficients and
            overwritten with the trained ones (default ``thetas.csv``).
        --visu: plot the result after training.
    """
    parser = argparse.ArgumentParser(description='DSLR is a 2 day project, if you do Linear Regression in two weeks :pepethefrog:')
    parser.add_argument("-p", "--path", type=str, default='data.csv', help="data file path")
    parser.add_argument("-t", "--thetas", type=str, default=False, help="thetas file path")
    parser.add_argument("--visu", default=False, help="plot data on graph", action="store_true")
    args = parser.parse_args()
    if args.thetas is False:
        args.thetas = 'thetas.csv'
    try:
        raw_data = np.genfromtxt(args.path, delimiter=',', skip_header=1)
    except (OSError, ValueError):
        print('csv file not found or wrong')
        return
    if raw_data.ndim != 2:
        # An empty or one-row file does not yield a samples x columns table,
        # and raw_data.shape[1] below would raise.
        print('csv file not found or wrong')
        return
    try:
        raw_thetas = np.genfromtxt(args.thetas, delimiter=',')
    except (OSError, ValueError):
        print('thetas file not found or wrong, setting thetas to 0')
        raw_thetas = np.zeros(raw_data.shape[1])
    train(raw_data, raw_thetas, args.thetas, args.visu)


if __name__ == '__main__':
    main()