Documentation

2019-01-24 20:42:41 +01:00 · 2019-01-24 20:42:41 +01:00 · 24e85deb17
parent a9a0e518e8
commit 24e85deb17
3 changed files with 54 additions and 30 deletions
--- a/README.md
+++ b/README.md
@ -1,24 +1,53 @@
+# K-Means clustering algorithm using Apache Flink
+
+Project for the Middleware Technologies for Distributed Systems.
+
+## Note
+
+- Only supports 2-dimensional points as input data
+- Non-deterministic. Only one starting point set is tried.
+- If a mean cannot be updated, it is discarded (so the number of clusters in the results may differ from the requested value of K)

 # Usage

-Compile job data
+## Compile job package
+
+You need Java ≥ 8 and Maven ≥ 3.1.

 ```shell
 mvn package
 ```

-Generate vectors to cluster
+## Generate random vectors to cluster (optional)
+
+You need Python 3.

 ```shell
 ./genVectors.py $DIMENSION $NUMBER > $FILE
 ```

-(example: `./genVectors.py 2 15 > myInput.csv`)
+(example: `./genVectors.py 2 1000 > input.csv`)


+## Classify

-Run
+You need a running Apache Flink cluster.
+
+Input data is a point per line, in the following format: `xCoords,yCoords`.
+Output data is a point per line, in the following format: `xCoords,yCoords,clusterIndex`.

 ```shell
-flink run target/project-*.jar --input $INPUT --output $OUTPUT
+flink run target/project-*.jar --input $INPUT --output $OUTPUT [--k $K] [--maxIterations $ITERATIONS]
 ```
+
+(example: `flink run target/project-1.0.jar --input $PWD/input.csv --output $PWD/output.csv --k 5`)
+
+## Show results
+
+You need Python 3, NumPy, Matplotlib.
+
+```shell
+./plotClassification.py $FILE
+```
+
+(example: `./plotClassification.py output.csv`)
--- a/pom.xml
+++ b/pom.xml
@ -22,7 +22,7 @@ under the License.

 	<groupId>it.polimi.middleware.projects.flink</groupId>
 	<artifactId>project</artifactId>
-	<version>1.0-SNAPSHOT</version>
+	<version>1.0</version>
 	<packaging>jar</packaging>

 	<name>Flink Quickstart Job</name>
--- a/src/main/java/it/polimi/middleware/projects/flink/KMeans.java
+++ b/src/main/java/it/polimi/middleware/projects/flink/KMeans.java
@ -34,9 +34,9 @@ import org.apache.flink.api.java.utils.ParameterTool;
 public class KMeans {

    public static void main(String[] args) throws Exception {
-        final ParameterTool params = ParameterTool.fromArgs(args);
        final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

+        final ParameterTool params = ParameterTool.fromArgs(args);
        final Integer k = params.getInt("k", 3);
        final Integer maxIterations = params.getInt("maxIterations", 25);

@ -49,28 +49,18 @@ public class KMeans {

        // Find min and max of the coordinates to determine where the initial centroids should be
        DataSet<Tuple4<Double, Double, Double, Double>> area = input
-            .map(new MapFunction<Point, Tuple4<Double, Double, Double, Double>>() {
+            .map(new MapFunction<Point, Tuple4<Double, Double, Double, Double>>() { // Format points so
+                // they can be passed as reduce parameters
                @Override
                public Tuple4<Double, Double, Double, Double> map(Point point) {
                    return new Tuple4<Double, Double, Double, Double>(point.x, point.y, point.x, point.y);
                }
-            }).reduce(new FindArea());
-
-        area.print();
-
-        DataSet<Tuple2<Double, Double>> testCentroids = area
-            .flatMap(new RandomCentroids(k))
-            .map(new MapFunction<Point, Tuple2<Double, Double>>() {
-                @Override
-                public Tuple2<Double, Double> map(Point point) {
-                    return new Tuple2<Double, Double>(point.x, point.y);
-                }});
-        testCentroids.print();
+            }).reduce(new FindArea()); // Gives the minX, minY, maxX, maxY of all the point

        // Generate random centroids
        IterativeDataSet<Point> centroids = area
-            .flatMap(new RandomCentroids(k))
-            .iterate(maxIterations);
+            .flatMap(new RandomCentroids(k)) // Create centroids randomly in the area of the points
+            .iterate(maxIterations); // Mark beginning of the loop

        // Assign points to centroids
        DataSet<Tuple2<Point, Integer>> assigned = input
@ -78,18 +68,18 @@ public class KMeans {

        // Calculate means
        DataSet<Point> newCentroids = assigned
-            .map(new MeanPrepare())
+            .map(new MeanPrepare()) // Add Integer field to tuple to count the points
            .groupBy(1) // GroupBy CentroidID
-            .reduce(new MeanSum())
-            .map(new MeanDivide());
+            .reduce(new MeanSum()) // Sum every points by centroid
+            .map(new MeanDivide()); // Divide by the number of points to get the average

-        DataSet<Point> finalCentroids = centroids.closeWith(newCentroids);
+        DataSet<Point> finalCentroids = centroids.closeWith(newCentroids); // Mark end of the loop

-        // Final assignment of points to centroids
+        // Final assignment of points to centroids (that's the data we want)
        assigned = input
            .map(new AssignCentroid()).withBroadcastSet(finalCentroids, "centroids");

-        // Convert to external format
+        // Convert to CSV format
        DataSet<Tuple3<Double, Double, Integer>> output = assigned
            .map(new MapFunction<Tuple2<Point, Integer>, Tuple3<Double, Double, Integer>>() {
                @Override
@ -128,6 +118,7 @@ public class KMeans {
        }

        public Point divideBy(Integer factor) {
+            // Since input is always re-fetched we can overwrite the values
            x /= factor;
            y /= factor;
            return this;
@ -148,12 +139,13 @@ public class KMeans {
    }

    public static class RandomCentroids implements FlatMapFunction<Tuple4<Double, Double, Double, Double>, Point> {
+        // minX, minY, maxX, maxY → Point × k
        Integer k;
        Random r;

        public RandomCentroids(Integer k) {
            this.k = k;
-            this.r = new Random(0);
+            this.r = new Random();
        }

        private Double randomRange(Double min, Double max) {
@ -174,16 +166,19 @@ public class KMeans {

        @Override
        public void open(Configuration parameters) throws Exception {
+            // Centroids are sorted so they have an identifier common to all the operators
            centroids = new ArrayList(getRuntimeContext().getBroadcastVariable("centroids"));
            Collections.sort(centroids);
        }

        @Override
        public Tuple2<Point, Integer> map(Point point) {
+            // Calculate the distance Point-Centroid for all centroids,
+            // keep the identifier of the closest centroid
            Integer c;
            Point centroid;
            Double distance;
-            Integer minCentroid = 4;
+            Integer minCentroid = 0;
            Double minDistance = Double.POSITIVE_INFINITY;

            for (c = 0; c < centroids.size(); c++) {