"""Batch gradient-descent linear regression on min-max scaled features."""

import numpy as np


class LinearRegression:
    """Linear model fitted by batch gradient descent.

    ``data`` is a 2-D array with one sample per row: every column except the
    last is a feature, the last column is the target.  Features are min-max
    scaled for training and :meth:`gradient_descent` converts the learned
    coefficients back to the raw feature scale afterwards.

    Layout of ``thetas`` (kept exactly as the original implementation read
    it): element ``i`` (``1 <= i <= n_features``) is the raw-scale slope of
    feature ``i - 1`` and the *last* element is taken as the intercept of the
    scaled model.  Element 0 is not used to initialise training.

    Attributes:
        cost: mean squared error recorded after every epoch.
        thetas: coefficients of the scaled model (intercept first).
        raw_thetas: coefficients on the raw feature scale (intercept first)
            once :meth:`gradient_descent` has run.
        raw_data: the input data as a float array.
        data: design matrix (column of ones + scaled features).
        data_y: target column of ``raw_data``.
    """

    def __init__(self, thetas, data, epochs=1000, learning_rate=0.001):
        """Store hyper-parameters, scale the data and initialise thetas.

        Falls back to all-zero thetas (with a printed notice) when
        ``thetas`` is too short, not one-dimensional or not finite.

        Raises:
            ValueError: if ``data`` is not a non-empty 2-D array.
        """
        self.cost = []
        self.epochs = epochs
        self.learning_rate = learning_rate
        self.raw_thetas = thetas
        self.raw_data = np.asarray(data, dtype=float)
        if self.raw_data.ndim != 2 or self.raw_data.size == 0:
            raise ValueError('data must be a non-empty 2-D array')
        self.__get_scaled_data()
        try:
            self.__get_scaled_thetas()
        except (IndexError, TypeError, ValueError):
            print('error in raw_thetas format, setting thetas to 0')
            self.thetas = np.zeros(self.raw_data.shape[1])
            self.raw_thetas = np.zeros(self.raw_data.shape[1])

    def gradient_descent(self):
        """Run ``epochs`` update steps, then refresh ``raw_thetas``.

        The raw-scale slopes are the scaled slopes divided by each feature's
        range.  The raw-scale intercept is chosen so that the fitted line
        passes through the mean of the data (mean(y) - sum(slope * mean(x))).
        """
        for _ in range(self.epochs):
            self.thetas = self.__gradient_descent_epoch()
            self.cost.append(self.get_cost())
        features = self.raw_data[:, :-1]
        raw = np.empty(len(self.thetas))
        raw[1:] = self.thetas[1:] / self._spans
        raw[0] = np.mean(self.data_y) - np.dot(raw[1:], features.mean(axis=0))
        self.raw_thetas = raw

    def get_cost(self):
        """Return the mean squared error of the scaled model over all rows."""
        # The original loop started at row 1 while still dividing by the
        # full row count, silently dropping the first sample's error.
        residuals = self.data.dot(self.thetas) - self.data_y
        return np.mean(residuals ** 2)

    def show(self):
        """Plot the data with the fitted line, and the cost per epoch.

        Uses the first two columns of ``raw_data`` as x and y.
        """
        # Imported here because this is the only method that needs
        # matplotlib; the rest of the class then works without it installed.
        import matplotlib.pyplot as plt

        x = self.raw_data[:, 0]
        x_max = x.max()
        plt.subplot(1, 2, 1)
        plt.plot(x, self.raw_data[:, 1], 'r.')
        plt.plot(
            [0, x_max],
            [self.raw_thetas[0], self.raw_thetas[0] + self.raw_thetas[1] * x_max],
        )
        plt.ylabel('y')
        plt.xlabel('x')
        plt.subplot(1, 2, 2)
        plt.plot(self.cost)
        plt.ylabel('cost')
        plt.xlabel('epochs')
        plt.tight_layout()
        plt.show()

    def __get_scaled_data(self):
        """Build the design matrix: a column of ones plus scaled features.

        The column of ones makes ``thetas[0]`` act as the intercept.  Each
        feature is min-max scaled: (x - min) / (max - min).
        """
        features = self.raw_data[:, :-1]
        mins = features.min(axis=0)
        spans = features.max(axis=0) - mins
        # A constant column has max == min; dividing by 0 would fill the
        # model with NaN.  Using 1 instead scales such a column to all zeros.
        spans[spans == 0] = 1.0
        self._spans = spans
        self.data = np.empty(shape=self.raw_data.shape)
        self.data[:, 0] = 1
        self.data[:, 1:] = (features - mins) / spans
        self.data_y = self.raw_data[:, -1]

    def __get_scaled_thetas(self):
        """Derive scaled-model thetas from ``raw_thetas``.

        Raises:
            ValueError: if ``raw_thetas`` is not a finite 1-D sequence with
                at least as many entries as ``raw_data`` has columns.
        """
        raw = np.asarray(self.raw_thetas, dtype=float)
        n_params = self.raw_data.shape[1]
        if raw.ndim != 1 or raw.size < n_params or not np.all(np.isfinite(raw)):
            raise ValueError('raw_thetas has the wrong shape or is not finite')
        self.thetas = np.empty(n_params)
        # The last entry carries the scaled-model intercept.
        self.thetas[0] = raw[-1]
        self.thetas[1:] = raw[1:n_params] * self._spans

    def __gradient_descent_epoch(self):
        """Return thetas after one batch gradient-descent step."""
        errors = self.data.dot(self.thetas) - self.data_y
        gradient = self.data.T.dot(errors) / float(len(self.data))
        return self.thetas - self.learning_rate * gradient

    def __predict(self, row):
        """Return the scaled model's prediction for sample index ``row``."""
        return np.dot(self.data[row], self.thetas)
"""Predict a value from linear-regression coefficients stored in a CSV file."""

import argparse

import numpy as np


def parse_number(text):
    """Convert typed input to an int, truncating any fractional part.

    The input is parsed as a plain number.  It is never evaluated as Python
    code, so expressions such as ``2*3`` are rejected.

    Raises:
        ValueError: if ``text`` is not a finite number.
    """
    try:
        return int(float(text))
    except OverflowError as err:  # int(float('inf'))
        raise ValueError('not a finite number') from err


def get_thetas(path):
    """Return the first two coefficients stored in ``path``.

    Returns the string ``"error"`` (after printing a notice) when the file
    cannot be read, holds fewer than two values, or holds values that are
    not finite numbers.
    """
    try:
        thetas = np.genfromtxt(path, delimiter=',')
    except (OSError, ValueError):
        print ('info: path not found, asking for train')
        return "error"
    try:
        pair = [thetas[0], thetas[1]]
    except IndexError:
        print ('warning: path in wrong format, asking for train')
        return "error"
    # genfromtxt turns unparsable cells into NaN instead of raising.
    if not np.all(np.isfinite(pair)):
        print ('warning: path in wrong format, asking for train')
        return "error"
    return pair


def get_price(mileage, thetas):
    """Return ``thetas[0] + mileage * thetas[1]``."""
    return thetas[0] + mileage * thetas[1]


def predict_subject():
    """Interactively predict a car price from a mileage.

    Reads coefficients from ``thetas.csv``.  When they are unavailable the
    user may train on ``data.csv`` first; otherwise both coefficients are 0.
    """
    thetas = get_thetas('thetas.csv')
    # get_thetas signals failure with the string "error".
    if isinstance(thetas, str):
        print("")
        try:
            input_thetas = input("model isn't trained, would you like to train it before predicting car price ? y/n\n")
            if input_thetas == 'y':
                # Imported on demand: training is only needed on this path.
                from train import train
                raw_data = np.genfromtxt('data.csv', delimiter=',', skip_header=1)
                train(raw_data, np.zeros(2), 'thetas.csv', False)
                thetas = np.genfromtxt('thetas.csv', delimiter=',')
            else:
                print ('info: wrong input format, setting thetas to 0')
                thetas = [0, 0]
        except Exception:
            # Best-effort boundary: any failure while prompting or training
            # degrades to zero coefficients.  Ctrl-C is no longer swallowed.
            print ('info: wrong input format or fail to train, setting thetas to 0')
            thetas = [0, 0]
    try:
        # SECURITY: this used to be int(eval(input(...))), which executed
        # whatever the user typed.
        mileage = parse_number(input("Enter mileage\n"))
    except (ValueError, EOFError):
        print ('info: input a number')
        return
    if mileage < 0:
        print ('info: mileage should be superior to 0 ! aborting')
        return
    price = get_price(mileage, thetas)
    if price < 0:
        print('This car belongs in a museum ! (price inferior to 0)')
    else:
        print('Predicted car value is ', price,)


def get_y(x, thetas):
    """Return the sum of ``x[i] * thetas[i]`` over the indices of ``x``."""
    return sum(value * thetas[i] for i, value in enumerate(x))


def predict(thetas_path):
    """Prompt for every feature and print the model output.

    The coefficient file is expected to hold one more value than there are
    model terms; the input vector is one shorter than the file, so the
    file's last value does not take part in the prediction.
    """
    try:
        thetas = np.atleast_1d(np.genfromtxt(thetas_path, delimiter=','))
    except (OSError, ValueError):
        print('wrong name or format')
        return
    # A one-value, multi-column or NaN-containing file cannot be used;
    # previously a one-value file crashed on len() / x[0].
    if thetas.ndim != 1 or thetas.size < 2 or not np.all(np.isfinite(thetas)):
        print('wrong name or format')
        return
    x = np.empty(len(thetas) - 1)
    x[0] = 1  # multiplies the intercept
    for i in range(1, len(x)):
        try:
            s = "Enter feature " + str(i) + "\n"
            # SECURITY: formerly int(eval(input(s))).
            x[i] = parse_number(input(s))
        except (ValueError, EOFError):
            print('input a number')
            return
    print("y is equal to ", get_y(x, thetas))


def main():
    """Parse the command line and run the matching prediction mode."""
    parser = argparse.ArgumentParser(description='DSLR is a 2 day project, if you do Linear Regression in two weeks :pepethefrog:')
    parser.add_argument("-p", "--path", type=str, default=None, help="thetas file path")
    args = parser.parse_args()
    if args.path is None:
        predict_subject()
    else:
        predict(args.path)


if __name__ == '__main__':
    main()
"""Train the linear-regression model on a CSV data set and save its thetas."""

import argparse

import numpy as np


def get_thetas():
    """Return two values read from ``thetas.csv``, or ``[0, 0]`` on failure.

    NOTE(review): this returns elements 2 and 1 of the file, in that order,
    and nothing in this script calls it -- confirm the intended order
    before relying on it.
    """
    try:
        thetas = np.genfromtxt('thetas.csv', delimiter=',')
    except (OSError, ValueError):
        print ("info: thetas.csv not found, setting thetas to 0")
        return [0, 0]
    try:
        return [thetas[2], thetas[1]]
    except IndexError:
        print ("warning: thetas.csv in wrong format, setting thetas to 0")
        return [0, 0]


def load_data(path):
    """Load a numeric CSV table (first line skipped) from ``path``.

    Returns a 2-D float array, or ``None`` when the file cannot be read, does
    not parse to at least two rows and two columns, or contains cells that
    are not finite numbers.
    """
    try:
        raw_data = np.genfromtxt(path, delimiter=',', skip_header=1)
    except (OSError, ValueError):
        return None
    # genfromtxt yields a 1-D array for a single row/column and NaN for
    # unparsable cells; neither can be trained on.
    if raw_data.ndim != 2 or raw_data.shape[1] < 2:
        return None
    if not np.all(np.isfinite(raw_data)):
        return None
    return raw_data


def train(raw_data, raw_thetas, thetas_path, visu):
    """Fit the model on ``raw_data`` and write the result to ``thetas_path``.

    The saved file holds the raw-scale thetas followed by the scaled-model
    intercept, one value per line.  When ``visu`` is true and the model has
    exactly two thetas, the fit is plotted.
    """
    # Imported on demand so the helpers above stay usable on their own.
    from LinearRegression import LinearRegression

    lr = LinearRegression(thetas=raw_thetas, data=raw_data, epochs=10000, learning_rate=0.1)
    print(lr.raw_data)
    print("thetas before train")
    print(lr.raw_thetas[:len(lr.raw_thetas) - 1])
    lr.gradient_descent()
    print("thetas after train")
    print(lr.raw_thetas)
    tosave = np.append(lr.raw_thetas, lr.thetas[0])
    np.savetxt(thetas_path, tosave, delimiter=',')
    if visu and len(lr.thetas) == 2:
        lr.show()


def main():
    """Parse the command line, load the inputs and run the training."""
    parser = argparse.ArgumentParser(description='DSLR is a 2 day project, if you do Linear Regression in two weeks :pepethefrog:')
    parser.add_argument("-p", "--path", type=str, default='data.csv', help="data file path")
    parser.add_argument("-t", "--thetas", type=str, default='thetas.csv', help="thetas file path")
    parser.add_argument("--visu", default=False, help="plot data on graph", action="store_true")
    args = parser.parse_args()
    raw_data = load_data(args.path)
    if raw_data is None:
        print('csv file not found or wrong')
        return
    try:
        raw_thetas = np.genfromtxt(args.thetas, delimiter=',')
    except (OSError, ValueError):
        print('thetas file not found or wrong, setting thetas to 0')
        raw_thetas = np.zeros(raw_data.shape[1])
    train(raw_data, raw_thetas, args.thetas, args.visu)


if __name__ == '__main__':
    main()