Basic scaffold
Generate, read, means, write all the points
This commit is contained in:
parent
e763a7611b
commit
de6a881428
3
.gitignore
vendored
3
.gitignore
vendored
|
@ -33,3 +33,6 @@ dependency-reduced-pom.xml
|
||||||
buildNumber.properties
|
buildNumber.properties
|
||||||
.mvn/timing.properties
|
.mvn/timing.properties
|
||||||
.mvn/wrapper/maven-wrapper.jar
|
.mvn/wrapper/maven-wrapper.jar
|
||||||
|
|
||||||
|
# This project
|
||||||
|
*.csv
|
||||||
|
|
24
README.md
Normal file
24
README.md
Normal file
|
@ -0,0 +1,24 @@
|
||||||
|
|
||||||
|
# Usage
|
||||||
|
|
||||||
|
Compile and package the job
|
||||||
|
|
||||||
|
```shell
|
||||||
|
mvn package
|
||||||
|
```
|
||||||
|
|
||||||
|
Generate vectors to cluster
|
||||||
|
|
||||||
|
```shell
|
||||||
|
./genVectors.py $DIMENSION $NUMBER > $FILE
|
||||||
|
```
|
||||||
|
|
||||||
|
(example: `./genVectors.py 2 15 > myInput.csv`)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Run
|
||||||
|
|
||||||
|
```shell
|
||||||
|
flink run target/project-*.jar --input $INPUT --output $OUTPUT
|
||||||
|
```
|
12
genVectors.py
Executable file
12
genVectors.py
Executable file
|
@ -0,0 +1,12 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
import random
|
||||||
|
import sys
|
||||||
|
|
||||||
|
USAGE = "usage: genVectors.py DIMENSION NUMBER"


def main(argv):
    """Print NUMBER random vectors with DIMENSION components each, as CSV lines.

    argv follows the sys.argv layout: argv[1] is the number of dimensions and
    argv[2] is the number of vectors. Each output line holds the comma-separated
    components of one vector, every component drawn with random.random().

    The generator is seeded with 0 so that repeated runs produce the same output.

    Exits with the usage message on stderr (keeping stdout, which is normally
    redirected to the output file, clean) when an argument is missing or is not
    an integer.
    """
    if len(argv) < 3:
        sys.exit(USAGE)

    try:
        D = int(argv[1])  # Number of dimensions
        N = int(argv[2])  # Number of vectors
    except ValueError:
        sys.exit(USAGE)

    # Fixed seed: the generated data set is reproducible across runs.
    random.seed(0)

    for _ in range(N):
        print(','.join([str(random.random()) for _ in range(D)]))


if __name__ == "__main__":
    main(sys.argv)
|
2
pom.xml
2
pom.xml
|
@ -144,7 +144,7 @@ under the License.
|
||||||
</filters>
|
</filters>
|
||||||
<transformers>
|
<transformers>
|
||||||
<transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
|
<transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
|
||||||
<mainClass>it.polimi.middleware.projects.flink.StreamingJob</mainClass>
|
<mainClass>it.polimi.middleware.projects.flink.KMeans</mainClass>
|
||||||
</transformer>
|
</transformer>
|
||||||
</transformers>
|
</transformers>
|
||||||
</configuration>
|
</configuration>
|
||||||
|
|
|
@ -1,66 +0,0 @@
|
||||||
/*
|
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one
|
|
||||||
* or more contributor license agreements. See the NOTICE file
|
|
||||||
* distributed with this work for additional information
|
|
||||||
* regarding copyright ownership. The ASF licenses this file
|
|
||||||
* to you under the Apache License, Version 2.0 (the
|
|
||||||
* "License"); you may not use this file except in compliance
|
|
||||||
* with the License. You may obtain a copy of the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
* See the License for the specific language governing permissions and
|
|
||||||
* limitations under the License.
|
|
||||||
*/
|
|
||||||
|
|
||||||
package it.polimi.middleware.projects.flink;
|
|
||||||
|
|
||||||
import org.apache.flink.api.java.ExecutionEnvironment;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Skeleton for a Flink Batch Job.
|
|
||||||
*
|
|
||||||
* <p>For a tutorial how to write a Flink batch application, check the
|
|
||||||
* tutorials and examples on the <a href="http://flink.apache.org/docs/stable/">Flink Website</a>.
|
|
||||||
*
|
|
||||||
* <p>To package your application into a JAR file for execution,
|
|
||||||
* change the main class in the POM.xml file to this class (simply search for 'mainClass')
|
|
||||||
* and run 'mvn clean package' on the command line.
|
|
||||||
*/
|
|
||||||
public class BatchJob {
|
|
||||||
|
|
||||||
public static void main(String[] args) throws Exception {
|
|
||||||
// set up the batch execution environment
|
|
||||||
final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Here, you can start creating your execution plan for Flink.
|
|
||||||
*
|
|
||||||
* Start with getting some data from the environment, like
|
|
||||||
* env.readTextFile(textPath);
|
|
||||||
*
|
|
||||||
* then, transform the resulting DataSet<String> using operations
|
|
||||||
* like
|
|
||||||
* .filter()
|
|
||||||
* .flatMap()
|
|
||||||
* .join()
|
|
||||||
* .coGroup()
|
|
||||||
*
|
|
||||||
* and many more.
|
|
||||||
* Have a look at the programming guide for the Java API:
|
|
||||||
*
|
|
||||||
* http://flink.apache.org/docs/latest/apis/batch/index.html
|
|
||||||
*
|
|
||||||
* and the examples
|
|
||||||
*
|
|
||||||
* http://flink.apache.org/docs/latest/apis/batch/examples.html
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
|
|
||||||
// execute program
|
|
||||||
env.execute("Flink Batch Java API Skeleton");
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -0,0 +1,50 @@
|
||||||
|
package it.polimi.middleware.projects.flink;
|
||||||
|
|
||||||
|
import org.apache.flink.api.common.functions.MapFunction;
|
||||||
|
import org.apache.flink.api.common.functions.ReduceFunction;
|
||||||
|
import org.apache.flink.api.common.typeinfo.TypeHint;
|
||||||
|
import org.apache.flink.api.common.typeinfo.TypeInformation;
|
||||||
|
import org.apache.flink.api.java.tuple.Tuple1;
|
||||||
|
import org.apache.flink.api.java.tuple.Tuple2;
|
||||||
|
|
||||||
|
import org.apache.flink.api.java.DataSet;
|
||||||
|
import org.apache.flink.api.java.ExecutionEnvironment;
|
||||||
|
import org.apache.flink.api.java.utils.ParameterTool;
|
||||||
|
|
||||||
|
|
||||||
|
public class KMeans {
|
||||||
|
|
||||||
|
public static void main(String[] args) throws Exception {
|
||||||
|
final ParameterTool params = ParameterTool.fromArgs(args);
|
||||||
|
final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
|
||||||
|
|
||||||
|
// Read CSV input
|
||||||
|
DataSet<Tuple1<Double>> csvInput = env.readCsvFile(params.get("input")).types(Double.class);
|
||||||
|
|
||||||
|
// Convert CSV to internal format
|
||||||
|
DataSet<Double> input = csvInput
|
||||||
|
.map(point -> point.f0);
|
||||||
|
|
||||||
|
// DEBUG Means all the points
|
||||||
|
DataSet<Tuple1<Double>> mean = input
|
||||||
|
.map(new MapFunction<Double, Tuple2<Double, Integer>>() {
|
||||||
|
public Tuple2<Double, Integer> map(Double value) {
|
||||||
|
return new Tuple2<Double, Integer>(value, 1);
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.reduce(new ReduceFunction<Tuple2<Double, Integer>>() {
|
||||||
|
public Tuple2<Double, Integer> reduce(Tuple2<Double, Integer> a, Tuple2<Double, Integer> b) {
|
||||||
|
return new Tuple2<Double, Integer>(a.f0 + b.f0, a.f1 + b.f1);
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.map(new MapFunction<Tuple2<Double, Integer>, Tuple1<Double>>() {
|
||||||
|
public Tuple1<Double> map(Tuple2<Double, Integer> value) {
|
||||||
|
return new Tuple1<Double>(value.f0 / value.f1);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
mean.writeAsCsv(params.get("output", "output.csv"));
|
||||||
|
|
||||||
|
env.execute("K-Means clustering");
|
||||||
|
}
|
||||||
|
}
|
|
@ -1,64 +0,0 @@
|
||||||
/*
|
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one
|
|
||||||
* or more contributor license agreements. See the NOTICE file
|
|
||||||
* distributed with this work for additional information
|
|
||||||
* regarding copyright ownership. The ASF licenses this file
|
|
||||||
* to you under the Apache License, Version 2.0 (the
|
|
||||||
* "License"); you may not use this file except in compliance
|
|
||||||
* with the License. You may obtain a copy of the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
* See the License for the specific language governing permissions and
|
|
||||||
* limitations under the License.
|
|
||||||
*/
|
|
||||||
|
|
||||||
package it.polimi.middleware.projects.flink;
|
|
||||||
|
|
||||||
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Skeleton for a Flink Streaming Job.
|
|
||||||
*
|
|
||||||
* <p>For a tutorial how to write a Flink streaming application, check the
|
|
||||||
* tutorials and examples on the <a href="http://flink.apache.org/docs/stable/">Flink Website</a>.
|
|
||||||
*
|
|
||||||
* <p>To package your application into a JAR file for execution, run
|
|
||||||
* 'mvn clean package' on the command line.
|
|
||||||
*
|
|
||||||
* <p>If you change the name of the main class (with the public static void main(String[] args))
|
|
||||||
* method, change the respective entry in the POM.xml file (simply search for 'mainClass').
|
|
||||||
*/
|
|
||||||
public class StreamingJob {
|
|
||||||
|
|
||||||
public static void main(String[] args) throws Exception {
|
|
||||||
// set up the streaming execution environment
|
|
||||||
final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Here, you can start creating your execution plan for Flink.
|
|
||||||
*
|
|
||||||
* Start with getting some data from the environment, like
|
|
||||||
* env.readTextFile(textPath);
|
|
||||||
*
|
|
||||||
* then, transform the resulting DataStream<String> using operations
|
|
||||||
* like
|
|
||||||
* .filter()
|
|
||||||
* .flatMap()
|
|
||||||
* .join()
|
|
||||||
* .coGroup()
|
|
||||||
*
|
|
||||||
* and many more.
|
|
||||||
* Have a look at the programming guide for the Java API:
|
|
||||||
*
|
|
||||||
* http://flink.apache.org/docs/latest/apis/streaming/index.html
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
|
|
||||||
// execute program
|
|
||||||
env.execute("Flink Streaming Java API Skeleton");
|
|
||||||
}
|
|
||||||
}
|
|
Reference in a new issue