ft_linear_regression
This commit is contained in:
parent
d5134e3903
commit
6898b2c39b
|
@ -0,0 +1,83 @@
|
||||||
|
import numpy as np
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
|
||||||
|
class LinearRegression:
    """Linear regression trained by batch gradient descent.

    ``data`` is a 2-D array: every column but the last is a feature and the
    last column is the target. Features are min-max scaled internally and
    training runs on the scaled values; ``raw_thetas`` holds the parameters
    expressed on the original (unscaled) features once ``gradient_descent``
    has run.

    Attributes set by the constructor:
        cost: list of the cost value recorded after each epoch.
        raw_data: the input data as an array.
        data: design matrix (leading column of ones, then scaled features).
        data_y: target column of ``raw_data``.
        thetas: parameters on the scaled features (``thetas[0]`` = intercept).
        raw_thetas: the thetas passed in, or zeros if they were unusable.
    """

    def __init__(self, thetas, data, epochs=1000, learning_rate=0.001):
        """Store the hyper-parameters, scale the data and the initial thetas.

        Args:
            thetas: initial parameters. Index ``i + 1`` is read as the
                unscaled slope of feature ``i`` and the LAST entry is read as
                the scaled intercept (see ``__get_scaled_thetas``).
            data: 2-D array-like, features first, target in the last column.
            epochs: number of gradient-descent iterations.
            learning_rate: step size of each iteration.
        """
        self.cost = []
        self.epochs = epochs
        self.learning_rate = learning_rate
        self.raw_thetas = thetas
        self.raw_data = np.asarray(data)
        self.__get_scaled_data()
        try:
            self.__get_scaled_thetas()
        except (IndexError, TypeError, ValueError):
            # Unusable initial thetas (too short, unsized, non numeric):
            # fall back to an all-zero starting point.
            print('error in raw_thetas format, setting thetas to 0')
            self.thetas = np.zeros(self.raw_data.shape[1])
            self.raw_thetas = np.zeros(self.raw_data.shape[1])

    def gradient_descent(self):
        """Run ``epochs`` iterations, then express the result as raw thetas.

        One cost value is appended to ``self.cost`` per epoch.
        """
        for _ in range(self.epochs):
            self.thetas = self.__gradient_descent_epoch()
            self.cost.append(self.get_cost())

        # Undo the min-max scaling of the slopes.
        self.raw_thetas = np.empty(len(self.thetas))
        self.raw_thetas[1:] = self.thetas[1:] / self.__feature_ranges()
        # The intercept is chosen so that the fitted line goes through the
        # point of means (mean of every feature, mean of the target).
        feature_means = self.raw_data[:, :-1].mean(axis=0)
        self.raw_thetas[0] = (np.mean(self.data_y)
                              - np.dot(self.raw_thetas[1:], feature_means))

    def get_cost(self):
        """Return the mean squared error of the current thetas over all rows."""
        residuals = np.dot(self.data, self.thetas) - self.data_y
        return float(np.mean(residuals ** 2))

    def show(self):
        """Plot the data with the fitted line, and the cost per epoch.

        Uses columns 0 and 1 of ``raw_data`` and the first two raw thetas, so
        it is only meaningful for single-feature data.
        """
        x_values = self.raw_data[:, 0]
        x_max = max(x_values)

        plt.subplot(1, 2, 1)
        plt.plot(x_values, self.raw_data[:, 1], 'r.')
        print(x_max)
        # Fitted line drawn from x = 0 to the largest x in the data.
        plt.plot([0, x_max],
                 [self.raw_thetas[0],
                  self.raw_thetas[0] + self.raw_thetas[1] * x_max])
        plt.ylabel('y')
        plt.xlabel('x')

        plt.subplot(1, 2, 2)
        plt.plot(self.cost)
        plt.ylabel('cost')
        plt.xlabel('epochs')

        plt.tight_layout()
        plt.show()

    def __feature_ranges(self):
        """Return max - min of every feature column, with 0 replaced by 1.

        A constant column has a zero range; dividing by it would fill the
        scaled data with NaN/inf, so such a column is left unscaled in width
        (it becomes all zeros after the minimum is subtracted).
        """
        features = self.raw_data[:, :-1]
        spans = features.max(axis=0) - features.min(axis=0)
        return np.where(spans == 0, 1.0, spans)

    def __get_scaled_data(self):
        """Build the design matrix and the target vector.

        Adds a column filled with 1 (so theta0 * x0 = theta0) and applies
        min-max normalization to the feature columns of the raw data.
        """
        features = self.raw_data[:, :-1]
        self.data_y = self.raw_data[:, -1]
        self.data = np.empty(shape=self.raw_data.shape)
        self.data[:, 0] = 1
        self.data[:, 1:] = ((features - features.min(axis=0))
                            / self.__feature_ranges())

    def __get_scaled_thetas(self):
        """Convert ``raw_thetas`` into thetas that apply to the scaled data.

        The intercept is read from the last entry of ``raw_thetas``; each
        slope is multiplied by the range of its feature.
        """
        n_params = self.raw_data.shape[1]
        ranges = self.__feature_ranges()
        self.thetas = np.empty(n_params)
        self.thetas[0] = self.raw_thetas[len(self.raw_thetas) - 1]
        # Kept as an explicit loop so that thetas that are too short raise
        # IndexError instead of being silently broadcast.
        for i in range(n_params - 1):
            self.thetas[i + 1] = self.raw_thetas[i + 1] * ranges[i]

    def __gradient_descent_epoch(self):
        """Return the thetas obtained after one batch gradient step."""
        residuals = np.dot(self.data, self.thetas) - self.data_y
        gradient = np.dot(self.data.T, residuals) / float(len(self.data))
        return self.thetas - self.learning_rate * gradient

    def __predict(self, row):
        """Return the model output for row ``row`` of the scaled data."""
        return np.dot(self.thetas, self.data[row])
|
|
@ -0,0 +1,25 @@
|
||||||
|
km,price
|
||||||
|
240000,3650
|
||||||
|
139800,3800
|
||||||
|
150500,4400
|
||||||
|
185530,4450
|
||||||
|
176000,5250
|
||||||
|
114800,5350
|
||||||
|
166800,5800
|
||||||
|
89000,5990
|
||||||
|
144500,5999
|
||||||
|
84000,6200
|
||||||
|
82029,6390
|
||||||
|
63060,6390
|
||||||
|
74000,6600
|
||||||
|
97500,6800
|
||||||
|
67000,6800
|
||||||
|
76025,6900
|
||||||
|
48235,6900
|
||||||
|
93000,6990
|
||||||
|
60949,7490
|
||||||
|
65674,7555
|
||||||
|
54000,7990
|
||||||
|
68500,7990
|
||||||
|
22899,7990
|
||||||
|
61789,8290
|
|
|
@ -0,0 +1,25 @@
|
||||||
|
km,price
|
||||||
|
240000,8650
|
||||||
|
139800,8800
|
||||||
|
150500,9400
|
||||||
|
185530,9450
|
||||||
|
176000,10250
|
||||||
|
114800,10350
|
||||||
|
166800,10800
|
||||||
|
89000,10990
|
||||||
|
144500,10999
|
||||||
|
84000,11200
|
||||||
|
82029,11390
|
||||||
|
63060,11390
|
||||||
|
74000,11600
|
||||||
|
97500,11800
|
||||||
|
67000,11800
|
||||||
|
76025,11900
|
||||||
|
48235,11900
|
||||||
|
93000,11990
|
||||||
|
60949,12490
|
||||||
|
65674,12555
|
||||||
|
54000,12990
|
||||||
|
68500,12990
|
||||||
|
22899,12990
|
||||||
|
61789,13290
|
|
|
@ -0,0 +1,25 @@
|
||||||
|
age,km,price
|
||||||
|
10,240000,8050
|
||||||
|
15,139800,8200
|
||||||
|
18,150500,7400
|
||||||
|
2,185530,9250
|
||||||
|
6,176000,9950
|
||||||
|
8,114800,9950
|
||||||
|
10,166800,9800
|
||||||
|
25,89000,7990
|
||||||
|
4,144500,10499
|
||||||
|
1,84000,11000
|
||||||
|
0,82029,12000
|
||||||
|
2,63060,11300
|
||||||
|
7,74000,11000
|
||||||
|
4,97500,11000
|
||||||
|
9,67000,10000
|
||||||
|
4,76025,11200
|
||||||
|
4,48235,11300
|
||||||
|
7,93000,10500
|
||||||
|
1,60949,12300
|
||||||
|
6,65674,11800
|
||||||
|
5,54000,12500
|
||||||
|
4,68500,12600
|
||||||
|
2,22899,12800
|
||||||
|
7,61789,12590
|
|
|
@ -0,0 +1,86 @@
|
||||||
|
import argparse
|
||||||
|
import numpy as np
|
||||||
|
from train import train
|
||||||
|
from LinearRegression import LinearRegression
|
||||||
|
from matplotlib import animation
|
||||||
|
|
||||||
|
|
||||||
|
def get_thetas(path):
    """Load the first two thetas stored in the file at ``path``.

    Args:
        path: path of a text file readable by ``np.genfromtxt``.

    Returns:
        ``[thetas[0], thetas[1]]`` on success, or the string ``"error"`` when
        the file cannot be read or does not hold two usable numbers.
    """
    try:
        thetas = np.genfromtxt(path, delimiter=',')
    except (OSError, ValueError):
        # Missing/unreadable file, or rows with inconsistent column counts.
        print ('info: path not found, asking for train')
        return "error"
    try:
        pair = [thetas[0], thetas[1]]
    except IndexError:
        # Fewer than two values in the file.
        print ('warning: path in wrong format, asking for train')
        return "error"
    # genfromtxt turns unparsable text into NaN instead of raising; treat
    # that as a wrong format rather than predicting with NaN.
    if not np.all(np.isfinite(pair)):
        print ('warning: path in wrong format, asking for train')
        return "error"
    return pair
|
||||||
|
|
||||||
|
def get_price(mileage, thetas):
    """Return the linear estimate ``thetas[0] + mileage * thetas[1]``."""
    intercept = thetas[0]
    slope = thetas[1]
    return intercept + mileage * slope
|
||||||
|
|
||||||
|
def predict_subject():
    """Interactively predict a car price from a mileage typed by the user.

    Thetas are read from ``thetas.csv``. When they are unavailable the user
    is offered to train the model on ``data.csv`` first; if training is
    declined or fails, thetas fall back to ``[0, 0]``. The result is printed;
    nothing is returned.
    """
    thetas = get_thetas('thetas.csv')
    if thetas == "error":
        print("")
        try:
            input_thetas = input("model isn't trained, would you like to train it before predicting car price ? y/n\n")
            if input_thetas == 'y':
                raw_data = np.genfromtxt('data.csv', delimiter=',', skip_header=1)
                train(raw_data, np.zeros(2), 'thetas.csv', False)
                thetas = np.genfromtxt('thetas.csv', delimiter=',')
            else:
                print ('info: wrong input format, setting thetas to 0')
                thetas = [0, 0]
        except Exception:
            # Best effort: any failure while prompting or training falls back
            # to zero thetas. KeyboardInterrupt is deliberately not caught.
            print ('info: wrong input format or fail to train, setting thetas to 0')
            thetas = [0, 0]

    try:
        # NOTE(security): this used to be int(eval(input(...))), which runs
        # arbitrary Python typed at the prompt. The input is now parsed as a
        # plain number; arithmetic expressions are no longer accepted.
        mileage = int(float(input("Enter mileage\n")))
    except (ValueError, OverflowError, EOFError):
        print ('info: input a number')
        return

    if mileage < 0:
        print ('info: mileage should be superior to 0 ! aborting')
        return

    price = get_price(mileage, thetas)
    if price < 0:
        print('This car belongs in a museum ! (price inferior to 0)')
    else:
        print('Predicted car value is ', price,)
|
||||||
|
|
||||||
|
def get_y(x, thetas):
    """Return the sum of ``x[i] * thetas[i]`` over every index of ``x``.

    Entries of ``thetas`` beyond ``len(x)`` are ignored.
    """
    return sum(value * thetas[index] for index, value in enumerate(x))
|
||||||
|
|
||||||
|
def predict(thetas_path):
    """Prompt for feature values and print the prediction of a saved model.

    Args:
        thetas_path: path of a thetas file readable by ``np.genfromtxt``.
            The last value of the file is not used as a coefficient: the
            input vector has ``len(thetas) - 1`` entries (a leading 1 for the
            intercept followed by the features typed by the user).

    Prints the result, or an error message when the file or an input is
    unusable. Returns nothing.
    """
    try:
        # atleast_1d: a file holding a single value loads as a 0-d array.
        thetas = np.atleast_1d(np.genfromtxt(thetas_path, delimiter=','))
    except (OSError, ValueError):
        print('wrong name or format')
        return
    if thetas.ndim != 1 or thetas.size < 2:
        # Not a flat list of at least two values: nothing to predict with.
        print('wrong name or format')
        return

    x = np.empty(len(thetas) - 1)
    x[0] = 1
    for i in range(1, len(thetas) - 1):
        try:
            # NOTE(security): this used to be int(eval(input(...))), which
            # runs arbitrary Python typed at the prompt. The input is now
            # parsed as a plain number; expressions are no longer accepted.
            x[i] = int(float(input("Enter feature " + str(i) + "\n")))
        except (ValueError, OverflowError, EOFError):
            print('input a number')
            return
    print("y is equal to ", get_y(x, thetas))
|
||||||
|
|
||||||
|
def main():
    """Parse the command line and run the matching prediction mode.

    With ``-p/--path`` the thetas file at that path is used by ``predict``;
    without it the interactive ``predict_subject`` flow runs.
    """
    cli = argparse.ArgumentParser(description='DSLR is a 2 day project, if you do Linear Regression in two weeks :pepethefrog:')
    cli.add_argument("-p", "--path", type=str, default=False, help="thetas file path")
    options = cli.parse_args()

    # False is the "option not given" sentinel; any string (even empty)
    # counts as an explicit path.
    if options.path is not False:
        predict(options.path)
        return
    predict_subject()


if __name__ == '__main__':
    main()
|
|
@ -0,0 +1,55 @@
|
||||||
|
import argparse
|
||||||
|
import numpy as np
|
||||||
|
from LinearRegression import LinearRegression
|
||||||
|
from matplotlib import animation
|
||||||
|
|
||||||
|
def get_thetas():
    """Load two thetas from ``thetas.csv`` in the current directory.

    Returns:
        ``[thetas[2], thetas[1]]`` when the file holds at least three values,
        otherwise ``[0, 0]`` (a message is printed in that case).
    """
    try:
        thetas = np.genfromtxt('thetas.csv', delimiter=',')
    except (OSError, ValueError):
        # Missing/unreadable file, or rows with inconsistent column counts.
        print ("info: thetas.csv not found, setting thetas to 0")
        return [0, 0]
    try:
        # NOTE(review): index 2 then index 1 is what the original code
        # returns; confirm this ordering is intended.
        return [thetas[2], thetas[1]]
    except IndexError:
        # Fewer than three values in the file.
        print ("warning: thetas.csv in wrong format, setting thetas to 0")
        return [0, 0]
|
||||||
|
|
||||||
|
def train(raw_data, raw_thetas, thetas_path, visu):
    """Fit a LinearRegression model, print its thetas and save them.

    Args:
        raw_data: data array handed to ``LinearRegression``.
        raw_thetas: initial thetas handed to ``LinearRegression``.
        thetas_path: file written with ``np.savetxt``; it receives every raw
            theta followed by ``thetas[0]`` of the trained model.
        visu: when truthy and the model has exactly two thetas, show plots.
    """
    model = LinearRegression(thetas=raw_thetas, data=raw_data, epochs=10000, learning_rate=0.1)

    print(model.raw_data)
    print("thetas before train")
    print(model.raw_thetas[:-1])

    model.gradient_descent()

    print("thetas after train")
    print(model.raw_thetas)

    # Saved layout: raw thetas first, then the model's thetas[0] last.
    saved = list(model.raw_thetas)
    saved.append(model.thetas[0])
    np.savetxt(thetas_path, saved, delimiter=',')

    if not visu or len(model.thetas) != 2:
        return
    model.show()
|
||||||
|
|
||||||
|
def main():
    """Parse the command line, load the data and thetas, then train.

    Options:
        -p/--path: data CSV (first line is skipped as a header).
        -t/--thetas: thetas file, read as the starting point and rewritten
            by ``train``.
        --visu: ask ``train`` to plot the result.

    Prints a message and returns without training when the data file cannot
    be used.
    """
    parser = argparse.ArgumentParser(description='DSLR is a 2 day project, if you do Linear Regression in two weeks :pepethefrog:')
    parser.add_argument("-p", "--path", type=str, default='data.csv', help="data file path")
    parser.add_argument("-t", "--thetas", type=str, default='thetas.csv', help="thetas file path")
    parser.add_argument("--visu", default=False, help="plot data on graph", action="store_true")
    args = parser.parse_args()

    try:
        raw_data = np.genfromtxt(args.path, delimiter=',', skip_header=1)
    except (OSError, ValueError):
        print('csv file not found or wrong')
        return
    # Training needs a 2-D table with at least one feature column and one
    # target column; a single row or single column loads as a 1-D array.
    if raw_data.ndim != 2 or raw_data.shape[1] < 2:
        print('csv file not found or wrong')
        return

    try:
        raw_thetas = np.genfromtxt(args.thetas, delimiter=',')
    except (OSError, ValueError):
        print('thetas file not found or wrong, setting thetas to 0')
        raw_thetas = np.zeros(raw_data.shape[1])

    train(raw_data, raw_thetas, args.thetas, args.visu)


if __name__ == '__main__':
    main()
|
Loading…
Reference in New Issue