Documentation

This commit is contained in:
Geoffrey Frogeye 2019-01-24 20:42:41 +01:00
parent a9a0e518e8
commit 24e85deb17
3 changed files with 54 additions and 30 deletions

View file

@ -1,24 +1,53 @@
# K-Means clustering algorithm using Apache Flink
Project for the Middleware Technologies for Distributed Systems.
## Note
- Only supports two-dimensional points as input data
- Non-deterministic. Only one starting point set is tried.
- If a mean cannot be updated, it is discarded (so the number of clusters in the results may be lower than the requested value of K)
# Usage # Usage
Compile job data ## Compile job package
You need Java ≥ 8 and Maven ≥ 3.1.
```shell ```shell
mvn package mvn package
``` ```
Generate vectors to cluster ## Generate random vectors to cluster (optional)
You need Python 3.
```shell ```shell
./genVectors.py $DIMENSION $NUMBER > $FILE ./genVectors.py $DIMENSION $NUMBER > $FILE
``` ```
(example: `./genVectors.py 2 15 > myInput.csv`) (example: `./genVectors.py 2 1000 > input.csv`)
## Classify
Run You need a running Apache Flink cluster
Input data is a point per line, in the following format: `xCoords,yCoords`.
Output data is a point per line, in the following format: `xCoords,yCoords,clusterIndex`.
```shell ```shell
flink run target/project-*.jar --input $INPUT --output $OUTPUT flink run target/project-*.jar --input $INPUT --output $OUTPUT [--k $K] [--maxIterations $ITERATIONS]
``` ```
(example: `flink run target/project-1.0.jar --input $PWD/input.csv --output $PWD/output.csv --k 5`)
## Show results
You need Python 3, NumPy, Matplotlib.
```shell
./plotClassification.py $FILE
```
(example: `./plotClassification.py output.csv`)

View file

@ -22,7 +22,7 @@ under the License.
<groupId>it.polimi.middleware.projects.flink</groupId> <groupId>it.polimi.middleware.projects.flink</groupId>
<artifactId>project</artifactId> <artifactId>project</artifactId>
<version>1.0-SNAPSHOT</version> <version>1.0</version>
<packaging>jar</packaging> <packaging>jar</packaging>
<name>Flink Quickstart Job</name> <name>Flink Quickstart Job</name>

View file

@ -34,9 +34,9 @@ import org.apache.flink.api.java.utils.ParameterTool;
public class KMeans { public class KMeans {
public static void main(String[] args) throws Exception { public static void main(String[] args) throws Exception {
final ParameterTool params = ParameterTool.fromArgs(args);
final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
final ParameterTool params = ParameterTool.fromArgs(args);
final Integer k = params.getInt("k", 3); final Integer k = params.getInt("k", 3);
final Integer maxIterations = params.getInt("maxIterations", 25); final Integer maxIterations = params.getInt("maxIterations", 25);
@ -49,28 +49,18 @@ public class KMeans {
// Find min and max of the coordinates to determine where the initial centroids should be // Find min and max of the coordinates to determine where the initial centroids should be
DataSet<Tuple4<Double, Double, Double, Double>> area = input DataSet<Tuple4<Double, Double, Double, Double>> area = input
.map(new MapFunction<Point, Tuple4<Double, Double, Double, Double>>() { .map(new MapFunction<Point, Tuple4<Double, Double, Double, Double>>() { // Format points so
// they can be passed as reduce parameters
@Override @Override
public Tuple4<Double, Double, Double, Double> map(Point point) { public Tuple4<Double, Double, Double, Double> map(Point point) {
return new Tuple4<Double, Double, Double, Double>(point.x, point.y, point.x, point.y); return new Tuple4<Double, Double, Double, Double>(point.x, point.y, point.x, point.y);
} }
}).reduce(new FindArea()); }).reduce(new FindArea()); // Gives the minX, minY, maxX, maxY of all the point
area.print();
DataSet<Tuple2<Double, Double>> testCentroids = area
.flatMap(new RandomCentroids(k))
.map(new MapFunction<Point, Tuple2<Double, Double>>() {
@Override
public Tuple2<Double, Double> map(Point point) {
return new Tuple2<Double, Double>(point.x, point.y);
}});
testCentroids.print();
// Generate random centroids // Generate random centroids
IterativeDataSet<Point> centroids = area IterativeDataSet<Point> centroids = area
.flatMap(new RandomCentroids(k)) .flatMap(new RandomCentroids(k)) // Create centroids randomly in the area of the points
.iterate(maxIterations); .iterate(maxIterations); // Mark beginning of the loop
// Assign points to centroids // Assign points to centroids
DataSet<Tuple2<Point, Integer>> assigned = input DataSet<Tuple2<Point, Integer>> assigned = input
@ -78,18 +68,18 @@ public class KMeans {
// Calculate means // Calculate means
DataSet<Point> newCentroids = assigned DataSet<Point> newCentroids = assigned
.map(new MeanPrepare()) .map(new MeanPrepare()) // Add Integer field to tuple to count the points
.groupBy(1) // GroupBy CentroidID .groupBy(1) // GroupBy CentroidID
.reduce(new MeanSum()) .reduce(new MeanSum()) // Sum every points by centroid
.map(new MeanDivide()); .map(new MeanDivide()); // Divide by the number of points to get the average
DataSet<Point> finalCentroids = centroids.closeWith(newCentroids); DataSet<Point> finalCentroids = centroids.closeWith(newCentroids); // Mark end of the loop
// Final assignment of points to centroids // Final assignment of points to centroids (that's the data we want)
assigned = input assigned = input
.map(new AssignCentroid()).withBroadcastSet(finalCentroids, "centroids"); .map(new AssignCentroid()).withBroadcastSet(finalCentroids, "centroids");
// Convert to external format // Convert to CSV format
DataSet<Tuple3<Double, Double, Integer>> output = assigned DataSet<Tuple3<Double, Double, Integer>> output = assigned
.map(new MapFunction<Tuple2<Point, Integer>, Tuple3<Double, Double, Integer>>() { .map(new MapFunction<Tuple2<Point, Integer>, Tuple3<Double, Double, Integer>>() {
@Override @Override
@ -128,6 +118,7 @@ public class KMeans {
} }
public Point divideBy(Integer factor) { public Point divideBy(Integer factor) {
// Since input is always re-fetched we can overwrite the values
x /= factor; x /= factor;
y /= factor; y /= factor;
return this; return this;
@ -148,12 +139,13 @@ public class KMeans {
} }
public static class RandomCentroids implements FlatMapFunction<Tuple4<Double, Double, Double, Double>, Point> { public static class RandomCentroids implements FlatMapFunction<Tuple4<Double, Double, Double, Double>, Point> {
// minX, minY, maxX, maxY Point × k
Integer k; Integer k;
Random r; Random r;
public RandomCentroids(Integer k) { public RandomCentroids(Integer k) {
this.k = k; this.k = k;
this.r = new Random(0); this.r = new Random();
} }
private Double randomRange(Double min, Double max) { private Double randomRange(Double min, Double max) {
@ -174,16 +166,19 @@ public class KMeans {
@Override @Override
public void open(Configuration parameters) throws Exception { public void open(Configuration parameters) throws Exception {
// Centroids are sorted so they have an identifier common to all the operators
centroids = new ArrayList(getRuntimeContext().getBroadcastVariable("centroids")); centroids = new ArrayList(getRuntimeContext().getBroadcastVariable("centroids"));
Collections.sort(centroids); Collections.sort(centroids);
} }
@Override @Override
public Tuple2<Point, Integer> map(Point point) { public Tuple2<Point, Integer> map(Point point) {
// Calculate the distance Point-Centroid for all centroids,
// keep the identifier of the closest centroid
Integer c; Integer c;
Point centroid; Point centroid;
Double distance; Double distance;
Integer minCentroid = 4; Integer minCentroid = 0;
Double minDistance = Double.POSITIVE_INFINITY; Double minDistance = Double.POSITIVE_INFINITY;
for (c = 0; c < centroids.size(); c++) { for (c = 0; c < centroids.size(); c++) {