Data Preparation

Proses persiapan data merupakan tahap awal dalam proses data mining maupun machine learning. Hasil yang baik merupakan penggabungan antara persiapan data yang baik dan proses pemilihan teknik yang tepat.

Advertisements

Java: K-means Clustering

Proses klaster (clustering) merupakan metode untuk mengelompokkan data ke dalam sejumlah kelompok (klaster) yang jumlahnya telah ditentukan sebelumnya.

import java.util.Random;

/**
 * Small k-means clustering demo over a hard-coded set of two-dimensional points.
 *
 * <p>The program picks {@code k} initial centroids from the data, then alternates
 * between assigning every point to its nearest centroid and moving each centroid
 * to the mean of its assigned points until the assignment stops changing (or an
 * iteration cap is reached). Each point is finally printed with its cluster index.
 */
public class KMeans {
	/** Upper bound on refinement iterations so the demo always terminates. */
	private static final int MAX_ITERATIONS = 100;

	/**
	 * Clusters the built-in data set into three groups and prints one line per
	 * point: its coordinates separated by tabs, followed by the cluster index.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		double[][] data = {
			{-7.59, 128.66},
			{-8.8, 112.53},
			{1.45, 128.83},
			{-5.69, 128.85},
			{-9.42, 107.02},
			{-6.18, 132.68},
			{2.12, 96.53},
			{-8.03, 110.39},
			{-8.41, 120.69},
			{-5.61, 133.38},
			{-0.98, 124.68},
			{-6.61, 130.28},
			{-5.34, 102.94},
			{-0.3, 132.94},
			{-3.32, 127.41},
			{2.49, 95.95},
			{-8.43, 121.83},
			{-0.21, 124.69},
			{-3.76, 101.81},
			{-2.61, 138.42},
			{-2.32, 138.52},
			{3.68, 126.06},
			{-0.59, 127.47},
			{-4.34, 103.44},
			{-7, 103.33},
			{-6.08, 125.88},
			{-1.06, 129.2},
			{-1.11, 129.24},
			{-8.1, 110.37},
			{-6.83, 129.6},
			{-0.48, 132.87},
			{3.76, 126.79},
			{-5.34, 133.95},
			{-9.08, 118.76},
			{1.88, 96.8},
			{5.07, 94.35},
			{-4.94, 133.9},
			{-4.66, 134.18},
			{-4.48, 133.86},
			{-7.42, 129.22},
			{-7.27, 123.15},
			{-4.83, 133.74},
			{-5.13, 133.92},
			{-4.93, 133.93},
			{-0.46, 133.46},
			{-4.81, 133.72},
			{4.93, 94.86},
			{-11.23, 113.66},
			{-4.05, 128.75},
			{-6.74, 128.82},
			{-0.86, 120.42},
			{-6.83, 104.86},
			{-5.76, 127.49},
			{2.7, 128.03},
			{-4.85, 133.83},
			{-5.67, 101.76},
			{-7.88, 106.41},
			{3.75, 94.91},
			{-7.29, 125.67},
			{4.66, 95.14},
			{-5.26, 131.53},
			{-3.43, 100.2},
			{-3.42, 99.39},
			{-3.26, 100.26},
			{-3.7, 99.78},
			{-2.59, 99.65},
			{-3.74, 99.62},
			{-3.88, 99.31},
			{-2.51, 99.62},
			{-3.74, 99.32},
			{-3.69, 99.3},
			{-3.45, 99.93},
			{-3.71, 99.34},
			{-6.48, 130.13},
			{2.82, 128.32},
			{-3.45, 99.42},
			{-3.77, 99.15},
			{-3.59, 99.74},
			{-6.19, 130.64},
			{-6.68, 103.78},
			{-4.57, 134.09},
			{2.49, 126.35},
			{-7.67, 128.53},
			{-7.97, 104.67},
			{-3.74, 99.37},
			{-3.85, 99.39},
			{-8.98, 110.08},
			{-9.37, 119.32},
			{-1.71, 99.31},
			{-8.15, 107.15},
			{0.55, 126.18},
			{1.54, 126.5},
			{-6.66, 130.18},
			{-6.67, 130.64},
			{-3.06, 101.26},
			{-1.9, 139.08},
			{-5.92, 130.87},
			{-8.06, 129.7},
			{1.03, 99.96},
			{-3.42, 100.01},
			{-7.47, 106.04},
			{-5.08, 102.79},
			{-6.21, 130.8},
			{-2.07, 136.48},
			{-3.28, 131.06},
			{-2.53, 129.19},
			{-6.54, 131.51},
			{3.79, 128.04},
			{3.8, 127.84},
			{-6.6, 130.73},
			{-8.41, 106.82},
			{2.62, 98.92},
			{-0.69, 133.07},
			{-6.37, 130.64},
			{-9.21, 127.07},
			{-3.88, 100.02},
			{-11.04, 107.35},
			{-0.21, 123.89},
			{-6.78, 130.04},
			{-7.38, 125.08},
			{-1.48, 132.06},
			{-9.92, 109.54},
			{-6.11, 117.55},
			{-6.6, 103.74},
			{-8.39, 119.74},
			{3.73, 95.95},
			{-7.64, 129.1},
			{-6.54, 130.17},
			{5.7, 93.56},
			{-9.08, 111.19},
			{0.3, 122.22},
			{2.6, 95.78},
			{-0.83, 121.74},
			{3.77, 95.94},
			{-8.19, 119.95},
			{-2.27, 137.99},
			{-4.68, 101.16},
			{-10.47, 113.33},
			{-2.61, 99.95},
			{4, 90.37},
			{0.05, 125},
			{0.05, 125},
			{-0.85, 119.78},
			{0.79, 123.32},
			{-5.21, 130.13},
			{3.63, 126.84},
			{-7.1, 105.15},
			{-5.23, 130.25},
			{-0.94, 124.19},
			{-7.16, 129.31}
		};

		// Number of clusters
		int k = 3;

		double[] cluster = runKMeans(data, k, MAX_ITERATIONS);

		for (int i = 0; i < data.length; i++) {
			for (int j = 0; j < data[0].length; j++) {
				System.out.print(data[i][j] + "\t");
			}
			System.out.print(cluster[i]);
			System.out.print("\n");
		}
	}

	/**
	 * Runs k-means: starts from centroids chosen by {@link #getRandom}, then
	 * repeatedly recomputes centroids and reassigns points until no label
	 * changes or {@code maxIterations} refinement rounds have been performed.
	 *
	 * @param data          points to cluster, one row per point
	 * @param k             number of clusters
	 * @param maxIterations maximum number of centroid-update rounds; with a value
	 *                      of zero or less the initial assignment is returned
	 * @return for each point, the index of its cluster stored as a {@code double};
	 *         an empty array when {@code data} is empty
	 * @throws IllegalArgumentException if {@code data} is non-empty and {@code k < 1}
	 */
	public static double[] runKMeans(double[][] data, int k, int maxIterations) {
		if (data.length == 0) {
			return new double[0];
		}

		double[][] centroids = getRandom(data, k);
		double[] labels = getEuclideanDistance(data, centroids, k);

		for (int round = 0; round < maxIterations; round++) {
			centroids = updateCentroids(data, labels, centroids);
			double[] next = getEuclideanDistance(data, centroids, k);

			boolean changed = false;
			for (int i = 0; i < labels.length; i++) {
				if (labels[i] != next[i]) {
					changed = true;
					break;
				}
			}
			labels = next;
			if (!changed) {
				// Stable assignment: further rounds would produce the same centroids.
				break;
			}
		}
		return labels;
	}

	/**
	 * Assigns every point to its nearest centroid by Euclidean distance.
	 *
	 * <p>Squared distances are compared, which selects the same nearest centroid
	 * as comparing the distances themselves while skipping the square root. When
	 * two centroids are equally near, the one with the lower index wins.
	 *
	 * @param data   points to classify, one row per point; the length of the
	 *               first row is used as the dimensionality of every row
	 * @param random centroid coordinates; the first {@code k} rows are used
	 * @param k      number of centroids to consider
	 * @return for each point, the index (0 to {@code k - 1}) of the nearest
	 *         centroid stored as a {@code double}; an empty array when
	 *         {@code data} is empty
	 * @throws IllegalArgumentException if {@code data} is non-empty and either
	 *         {@code k < 1} or {@code random} has fewer than {@code k} rows
	 */
	public static double[] getEuclideanDistance(double[][] data, double[][] random, int k) {
		double[] clusterClass = new double[data.length];
		if (data.length == 0) {
			return clusterClass;
		}
		if (k < 1) {
			throw new IllegalArgumentException("k must be at least 1, got " + k);
		}
		if (random.length < k) {
			throw new IllegalArgumentException(
				"expected at least " + k + " centroids, got " + random.length);
		}

		int dims = data[0].length;
		for (int i = 0; i < data.length; i++) {
			int nearest = 0;
			double best = squaredDistance(data[i], random[0], dims);
			for (int c = 1; c < k; c++) {
				double candidate = squaredDistance(data[i], random[c], dims);
				if (candidate < best) {
					best = candidate;
					nearest = c;
				}
			}
			clusterClass[i] = nearest;
		}
		return clusterClass;
	}

	/**
	 * Picks {@code k} rows of {@code data} to serve as initial centroids.
	 *
	 * <p>Rows are drawn without replacement, so no row index is used twice as long
	 * as {@code k <= data.length}; any centroids beyond {@code data.length} are
	 * drawn with replacement. Rows that hold equal values can still yield equal
	 * centroids. The returned rows are copies and do not alias {@code data}.
	 *
	 * @param data points to sample from, one row per point
	 * @param k    number of centroids to pick
	 * @return a {@code k}-row array of copied points
	 * @throws IllegalArgumentException if {@code data} is empty
	 */
	public static double[][] getRandom(double[][] data, int k) {
		if (data.length == 0) {
			throw new IllegalArgumentException("data must contain at least one point");
		}

		int dims = data[0].length;
		double[][] random = new double[k][dims];
		Random rng = new Random();

		int[] order = new int[data.length];
		for (int i = 0; i < order.length; i++) {
			order[i] = i;
		}

		for (int a = 0; a < k; a++) {
			int pick;
			if (a < order.length) {
				// Partial Fisher-Yates shuffle: swap a not-yet-used index into slot a.
				int swap = a + rng.nextInt(order.length - a);
				pick = order[swap];
				order[swap] = order[a];
				order[a] = pick;
			} else {
				// More centroids requested than points exist; repeats are unavoidable.
				pick = rng.nextInt(data.length);
			}
			System.arraycopy(data[pick], 0, random[a], 0, dims);
		}
		return random;
	}

	/** Returns the sum of squared coordinate differences over the first {@code dims} entries. */
	private static double squaredDistance(double[] point, double[] centroid, int dims) {
		double sum = 0;
		for (int d = 0; d < dims; d++) {
			double diff = point[d] - centroid[d];
			sum += diff * diff;
		}
		return sum;
	}

	/**
	 * Returns new centroids, each the mean of the points labelled with its index;
	 * a centroid with no assigned points keeps its previous coordinates.
	 */
	private static double[][] updateCentroids(double[][] data, double[] labels, double[][] centroids) {
		int k = centroids.length;
		int dims = data[0].length;
		double[][] updated = new double[k][dims];
		int[] counts = new int[k];

		for (int i = 0; i < data.length; i++) {
			int c = (int) labels[i];
			counts[c]++;
			for (int d = 0; d < dims; d++) {
				updated[c][d] += data[i][d];
			}
		}

		for (int c = 0; c < k; c++) {
			if (counts[c] == 0) {
				System.arraycopy(centroids[c], 0, updated[c], 0, dims);
			} else {
				for (int d = 0; d < dims; d++) {
					updated[c][d] /= counts[c];
				}
			}
		}
		return updated;
	}
}