Basic scaffold

Generate, read, average, and write all the points
2019-01-23 23:14:35 +01:00 · 2019-01-23 23:14:35 +01:00 · de6a881428
commit de6a881428
parent e763a7611b
7 changed files with 90 additions and 131 deletions
--- a/.gitignore
+++ b/.gitignore
@ -33,3 +33,6 @@ dependency-reduced-pom.xml
 buildNumber.properties
 .mvn/timing.properties
 .mvn/wrapper/maven-wrapper.jar
 # This project
 *.csv
--- a/README.md
+++ b/README.md
@ -0,0 +1,24 @@
 # Usage
 Compile the job into a JAR
 ```shell
 mvn package
 ```
 Generate the vectors to cluster (`$NUMBER` vectors with `$DIMENSION` components each, written as CSV)
 ```shell
 ./genVectors.py $DIMENSION $NUMBER > $FILE
 ```
 (example: `./genVectors.py 2 15 > myInput.csv`)
 Run the job on Flink
 ```shell
 flink run target/project-*.jar --input $INPUT --output $OUTPUT
 ```
--- a/genVectors.py
+++ b/genVectors.py
@ -0,0 +1,12 @@
 #!/usr/bin/env python3
 """Print random vectors as CSV on stdout, one vector per line.
 Usage: genVectors.py DIMENSION NUMBER
 """
 import random
 import sys
 def generate_rows(dimensions, count, rng):
     """Yield `count` CSV lines, each joining `dimensions` values of rng.random() with commas."""
     for _ in range(count):
         yield ','.join(str(rng.random()) for _ in range(dimensions))
 def main(argv):
     """Read DIMENSION and NUMBER from argv, print the vectors, and return an exit code.
     Returns 0 on success and 2 when the arguments are missing or are not integers.
     """
     try:
         dimensions = int(argv[1])  # Number of dimensions
         count = int(argv[2])  # Number of vectors
     except (IndexError, ValueError):
         # Report bad invocations with a usage line instead of a traceback.
         program = argv[0] if argv else 'genVectors.py'
         print('usage: {} DIMENSION NUMBER'.format(program), file=sys.stderr)
         return 2
     # Fixed seed: the same arguments always produce the same vectors.
     rng = random.Random(0)
     for row in generate_rows(dimensions, count, rng):
         print(row)
     return 0
 if __name__ == '__main__':
     sys.exit(main(sys.argv))
--- a/pom.xml
+++ b/pom.xml
@ -144,7 +144,7 @@ under the License.
 							</filters>
 							<transformers>
 								<transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
-									<mainClass>it.polimi.middleware.projects.flink.StreamingJob</mainClass>
+									<mainClass>it.polimi.middleware.projects.flink.KMeans</mainClass>
 								</transformer>
 							</transformers>
 						</configuration>
--- a/src/main/java/it/polimi/middleware/projects/flink/BatchJob.java
+++ b/src/main/java/it/polimi/middleware/projects/flink/BatchJob.java
@ -1,66 +0,0 @@
 /*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package it.polimi.middleware.projects.flink;
 import org.apache.flink.api.java.ExecutionEnvironment;
 /**
  * Starting point for a Flink batch job.
  *
  * <p>Tutorials and examples on writing Flink batch applications are available on the
  * <a href="http://flink.apache.org/docs/stable/">Flink Website</a>.
  *
  * <p>To build a JAR file for execution, set the 'mainClass' entry of the POM.xml file
  * to this class and run 'mvn clean package' on the command line.
  */
 public class BatchJob {
 	/**
 	 * Obtains the batch execution environment and executes the plan built on it.
 	 *
 	 * @param args command-line arguments; this skeleton does not read them
 	 * @throws Exception any failure propagated while executing the job
 	 */
 	public static void main(String[] args) throws Exception {
 		// Obtain the batch execution environment.
 		final ExecutionEnvironment batchEnv = ExecutionEnvironment.getExecutionEnvironment();
 		/*
 		 * Build the execution plan between this point and the execute() call.
 		 *
 		 * Typically: load data through the environment, e.g.
 		 * 	batchEnv.readTextFile(textPath);
 		 *
 		 * and then transform the resulting DataSet<String> with operations such as
 		 * 	.filter()
 		 * 	.flatMap()
 		 * 	.join()
 		 * 	.coGroup()
 		 *
 		 * Programming guide for the Java API:
 		 *
 		 * http://flink.apache.org/docs/latest/apis/batch/index.html
 		 *
 		 * Examples:
 		 *
 		 * http://flink.apache.org/docs/latest/apis/batch/examples.html
 		 *
 		 */
 		// Run the plan.
 		batchEnv.execute("Flink Batch Java API Skeleton");
 	}
 }
--- a/src/main/java/it/polimi/middleware/projects/flink/KMeans.java
+++ b/src/main/java/it/polimi/middleware/projects/flink/KMeans.java
@ -0,0 +1,50 @@
 package it.polimi.middleware.projects.flink;
 import org.apache.flink.api.common.functions.MapFunction;
 import org.apache.flink.api.common.functions.ReduceFunction;
 import org.apache.flink.api.common.typeinfo.TypeHint;
 import org.apache.flink.api.common.typeinfo.TypeInformation;
 import org.apache.flink.api.java.tuple.Tuple1;
 import org.apache.flink.api.java.tuple.Tuple2;
 import org.apache.flink.api.java.DataSet;
 import org.apache.flink.api.java.ExecutionEnvironment;
 import org.apache.flink.api.java.utils.ParameterTool;
 public class KMeans {
    public static void main(String[] args) throws Exception {
        final ParameterTool params = ParameterTool.fromArgs(args);
        final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        // Read CSV input
        DataSet<Tuple1<Double>> csvInput = env.readCsvFile(params.get("input")).types(Double.class);
        // Convert CSV to internal format
        DataSet<Double> input = csvInput
            .map(point -> point.f0);
        // DEBUG Means all the points
        DataSet<Tuple1<Double>> mean = input
            .map(new MapFunction<Double, Tuple2<Double, Integer>>() {
                public Tuple2<Double, Integer> map(Double value) {
                    return new Tuple2<Double, Integer>(value, 1);
                }
            })
            .reduce(new ReduceFunction<Tuple2<Double, Integer>>() {
                public Tuple2<Double, Integer> reduce(Tuple2<Double, Integer> a, Tuple2<Double, Integer> b) {
                    return new Tuple2<Double, Integer>(a.f0 + b.f0, a.f1 + b.f1);
                }
            })
            .map(new MapFunction<Tuple2<Double, Integer>, Tuple1<Double>>() {
                public Tuple1<Double> map(Tuple2<Double, Integer> value) {
                    return new Tuple1<Double>(value.f0 / value.f1);
                }
            });
        mean.writeAsCsv(params.get("output", "output.csv"));
        env.execute("K-Means clustering");
    }
 }
--- a/src/main/java/it/polimi/middleware/projects/flink/StreamingJob.java
+++ b/src/main/java/it/polimi/middleware/projects/flink/StreamingJob.java
@ -1,64 +0,0 @@
 /*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package it.polimi.middleware.projects.flink;
 import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
 /**
  * Starting point for a Flink streaming job.
  *
  * <p>Tutorials and examples on writing Flink streaming applications are available on the
  * <a href="http://flink.apache.org/docs/stable/">Flink Website</a>.
  *
  * <p>Run 'mvn clean package' on the command line to build a JAR file for execution.
  *
  * <p>When the main class (the one declaring public static void main(String[] args)) is
  * renamed, update the matching 'mainClass' entry in the POM.xml file.
  */
 public class StreamingJob {
 	/**
 	 * Obtains the streaming execution environment and executes the plan built on it.
 	 *
 	 * @param args command-line arguments; this skeleton does not read them
 	 * @throws Exception any failure propagated while executing the job
 	 */
 	public static void main(String[] args) throws Exception {
 		// Obtain the streaming execution environment.
 		final StreamExecutionEnvironment streamEnv = StreamExecutionEnvironment.getExecutionEnvironment();
 		/*
 		 * Build the execution plan between this point and the execute() call.
 		 *
 		 * Typically: load data through the environment, e.g.
 		 * 	streamEnv.readTextFile(textPath);
 		 *
 		 * and then transform the resulting DataStream<String> with operations such as
 		 * 	.filter()
 		 * 	.flatMap()
 		 * 	.join()
 		 * 	.coGroup()
 		 *
 		 * Programming guide for the Java API:
 		 *
 		 * http://flink.apache.org/docs/latest/apis/streaming/index.html
 		 *
 		 */
 		// Run the plan.
 		streamEnv.execute("Flink Streaming Java API Skeleton");
 	}
 }