In my Flink code, I am streaming a file located in an HDFS folder, and I get the error "(No such file or directory)". However, I am sure the file name and path are correct, as I used the same ones in the batch methods and everything worked smoothly.
Does anyone know what could be the problem?
Here is my code:
DataStream<MyObject> myStream =
env.addSource(new MyObjectGenerator("hdfs://../Data/Dataset1.csv"));
and its related class:
public class MyObjectGenerator implements SourceFunction<MyObject> {
private String dataFilePath;
private float servingSpeedFactor;
private Integer rowNo ;
private transient BufferedReader reader;
private transient InputStream inputStream;
public MyObjectGenerator(String dataFilePath) {
this(dataFilePath, 1.0f);
}
public MyObjectGenerator(String dataFilePath, float servingSpeedFactor) {
this.dataFilePath = dataFilePath;
this.servingSpeedFactor = servingSpeedFactor;
rowNo = 0 ;
}
@Override
public void run(SourceContext<MyObject> sourceContext) throws Exception {
long servingStartTime = Calendar.getInstance().getTimeInMillis();
inputStream = new DataInputStream(new FileInputStream(dataFilePath));
reader = new BufferedReader(new InputStreamReader(inputStream));
String line;
long dataStartTime;
rowNo++;
if (reader.ready() && (line = reader.readLine()) != null ) {
MyObject myObject = MyObject.fromString(line);
if (myObject != null)
sourceContext.collect(myObject);
} else {
return;
}
while (reader.ready() && (line = reader.readLine()) != null) {
MyObject myObject = MyObject.fromString(line);
sourceContext.collect(myObject);
}
this.reader.close();
this.reader = null;
this.inputStream.close();
this.inputStream = null;
}
@Override
public void cancel() {
try {
if (this.reader != null) {
this.reader.close();
}
if( this.inputStream != null) {
this.inputStream.close();
}
} catch (IOException ioe) {
//
} finally {
this.reader = null;
this.inputStream = null;
}
}
}
You are trying to access a file in HDFS with Java's regular FileInputStream. FileInputStream can only access the local file system; it does not know anything about talking to HDFS. You need to use the HDFS client to read files from HDFS. See Flink's FileInputFormat as an example.
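For illustration, here is a minimal sketch of how run() could open the stream through the Hadoop FileSystem API instead (an assumption about your setup: the Hadoop client libraries are on the classpath; fully qualified names are used to avoid clashing with Flink's own Path class):
// Open the file via the HDFS client rather than java.io.FileInputStream.
org.apache.hadoop.fs.Path path = new org.apache.hadoop.fs.Path(dataFilePath);
org.apache.hadoop.conf.Configuration conf = new org.apache.hadoop.conf.Configuration();
org.apache.hadoop.fs.FileSystem fs = org.apache.hadoop.fs.FileSystem.get(path.toUri(), conf);
inputStream = fs.open(path); // FSDataInputStream extends InputStream
reader = new BufferedReader(new InputStreamReader(inputStream));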
However, I would try to avoid implementing this yourself if possible. You could use Flink's FileInputFormat to read the file line-wise (giving a DataStream<String>) and a subsequent (flat) map that parses each line.
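For example, a sketch of that approach (assuming MyObject.fromString returns null for unparseable lines; depending on your Flink version you may need a .returns(MyObject.class) type hint after the map):
// Let Flink read the HDFS file line-wise and parse each line in a mapper.
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
DataStream<MyObject> myStream = env
    .readTextFile("hdfs://../Data/Dataset1.csv") // Flink resolves the hdfs:// scheme itself
    .map(MyObject::fromString)                   // parse each line
    .filter(obj -> obj != null);                 // drop lines that failed to parse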
I am trying to put and read files on a remote file system using WinSCP through an SFTP connection. The leaf node of the file system is an S3 object store which contains the files (e.g., xyz.txt).
Below is the overridden method of the FileChannel class.
XYZFileSystemProvider
public class XYZFileSystemProvider extends FileSystemProvider {
@Override
public FileChannel newFileChannel(Path path, Set<? extends OpenOption> options, FileAttribute<?>... attrs)
throws IOException {
// TODO Auto-generated method stub
Collection<XYZOptions.OpenMode> modes = XYZOptions.OpenMode.fromOpenOptions(options);
if (modes.isEmpty()) {
modes = EnumSet.of(XYZOptions.OpenMode.Read, XYZOptions.OpenMode.Write);
}
// TODO: process file attributes
return new XYZFileSystemChannel(path, modes);
}
}
XYZFileSystemChannel
public class XYZFileSystemChannel extends XYZRemotePathChannel{
public XYZFileSystemChannel(XYZPath p, Collection<XYZOptions.OpenMode> modes) throws IOException {
this(Objects.requireNonNull(p, "No target path").toString(), p.getFileSystem(), modes);
}
public XYZFileSystemChannel(String remotePath, XYZFileSystem fs, Collection<XYZOptions.OpenMode> modes) throws IOException {
super(remotePath, fs, true, modes);
}
}
XYZRemotePathChannel
public class XYZRemotePathChannel extends FileChannel {
private AmazonS3Component getAmazonS3Instance() {
return SpringContext.getBean(AmazonS3Component.class);
}
private final String path;
private final Collection<XYZOptions.OpenMode> modes;
private final boolean closeOnExit;
private XYZFileSystem fileSystem;
private final AtomicLong posTracker = new AtomicLong(0L);
public static final Set<XYZOptions.OpenMode> READ_MODES =
Collections.unmodifiableSet(EnumSet.of(XYZOptions.OpenMode.Read));
private final Object lock = new Object();
private final AtomicReference<Thread> blockingThreadHolder = new AtomicReference<>(null);
public XYZRemotePathChannel(String path, XYZFileSystem fileSystem, boolean closeOnExit,
Collection<XYZOptions.OpenMode> modes) throws IOException {
this.path = ValidateUtils.checkNotNullAndNotEmpty(path, "No remote file path specified");
this.modes = Objects.requireNonNull(modes, "No channel modes specified");
this.closeOnExit = closeOnExit;
this.fileSystem = fileSystem;
}
@Override
public int read(ByteBuffer dst) throws IOException {
// TODO Auto-generated method stub
log.debug("Position of dst is : {}",dst.position());
log.debug("Reading the bytes of the file : {}", dst);
// Some code to be done here in order to read dst and send bytes of the file received from the S3 store
return (int) doRead(Collections.singletonList(dst), -1);
}
protected long doRead(List<ByteBuffer> buffers, long position) throws IOException {
log.debug("Do Reading the bytes of the file of list of buffer : {} and position :{}", buffers , position);
ensureOpen(READ_MODES);
synchronized (lock) {
boolean completed = false;
boolean eof = false;
long curPos = (position >= 0L) ? position : posTracker.get();
byte[] bytes = new byte[(int) curPos];
try {
long totalRead = 0;
beginBlocking();
String [] parts = this.path.toString().replaceFirst("^/", "").split("/");
String bucket = parts[parts.length-2];
String fileName = parts[parts.length-1];
InputStream fileContent = getAmazonS3Instance().getFileFromBucket(bucket, fileName);
log.debug("Contens of the file: {} from bucket: {} are : {}", fileName , bucket, fileContent);
//Some code to be done here to return the content byte length??
int fileLength = fileContent.read(bytes, 1, (int) curPos);
log.debug("After reading the file content the file length is : {}" , fileLength );
return fileLength;
} finally {
if (position < 0L) {
posTracker.set(curPos);
}
endBlocking(completed);
}
}
}
private void endBlocking(boolean completed) throws AsynchronousCloseException {
blockingThreadHolder.set(null);
end(completed);
}
private void beginBlocking() {
begin();
blockingThreadHolder.set(Thread.currentThread());
}
@Override
public FileChannel position(long newPosition) throws IOException {
// TODO Auto-generated method stub
log.debug("Setting the position of the file : {}", newPosition);
if (newPosition < 0L) {
throw new IllegalArgumentException("position(" + this.path + ") illegal file channel position: " + newPosition);
}
ensureOpen(Collections.emptySet());
posTracker.set(newPosition);
return this;
}
private void ensureOpen(Collection<XYZOptions.OpenMode> reqModes) throws IOException {
if (!isOpen()) {
throw new ClosedChannelException();
}
if (GenericUtils.size(reqModes) > 0) {
for (XYZOptions.OpenMode m : reqModes) {
if (this.modes.contains(m)) {
return;
}
}
throw new IOException("ensureOpen(" + this.path + ") current channel modes (" + this.modes
+ ") do not contain any of the required: " + reqModes);
}
}
}
XYZOptions
public class XYZOptions {
enum OpenMode {
Read, Write, Append, Create, Truncate, Exclusive;
public static final Set<OpenOption> SUPPORTED_OPTIONS = Collections
.unmodifiableSet(EnumSet.of(StandardOpenOption.READ, StandardOpenOption.APPEND,
StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING, StandardOpenOption.WRITE,
StandardOpenOption.CREATE_NEW, StandardOpenOption.SPARSE));
public static Set<OpenMode> fromOpenOptions(Collection<? extends OpenOption> options) {
if (GenericUtils.isEmpty(options)) {
return Collections.emptySet();
}
Set<OpenMode> modes = EnumSet.noneOf(OpenMode.class);
for (OpenOption option : options) {
if (option == StandardOpenOption.READ) {
modes.add(Read);
} else if (option == StandardOpenOption.APPEND) {
modes.add(Append);
} else if (option == StandardOpenOption.CREATE) {
modes.add(Create);
} else if (option == StandardOpenOption.TRUNCATE_EXISTING) {
modes.add(Truncate);
} else if (option == StandardOpenOption.WRITE) {
modes.add(Write);
} else if (option == StandardOpenOption.CREATE_NEW) {
modes.add(Create);
modes.add(Exclusive);
} else if (option == StandardOpenOption.SPARSE) {
continue;
} else {
throw new IllegalArgumentException("Unsupported open option: " + option);
}
}
return modes;
}
}
}
I am able to fetch the file from the S3 store but am not sure how to read and pass all the contents when someone drags and drops it from the remote file location to their own system using WinSCP. I know I am missing some code at the indicated places but am not sure how to achieve it.
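One possible way to fill in the missing doRead logic, sketched under the assumption that getFileFromBucket returns a fresh stream positioned at byte 0 of the object: skip to the tracked position, then copy from the S3 stream into the destination buffers. This would replace the fixed-size bytes array and the single read call inside doRead:
// Sketch: stream the S3 object into the destination buffers.
try (InputStream fileContent = getAmazonS3Instance().getFileFromBucket(bucket, fileName)) {
    if (fileContent.skip(curPos) < curPos) {
        return -1; // the object is shorter than the requested position
    }
    long totalRead = 0;
    byte[] chunk = new byte[8192];
    for (ByteBuffer buffer : buffers) {
        while (buffer.hasRemaining()) {
            int wanted = Math.min(chunk.length, buffer.remaining());
            int read = fileContent.read(chunk, 0, wanted);
            if (read < 0) {
                return (totalRead > 0) ? totalRead : -1; // end of stream
            }
            buffer.put(chunk, 0, read);
            totalRead += read;
        }
    }
    curPos += totalRead; // the finally block stores this back into posTracker
    return totalRead;
}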
I use JSF and want to have a file download in my page. I wrote some code, but I get a ClientAbortException when I use a download manager to download my file:
public class FileUtil {
public static FacesContext getContext() {
return FacesContext.getCurrentInstance();
}
public static void sendFile(File file, boolean attachment) throws IOException {
sendFile(getContext(), file, attachment);
}
public static void sendFile(FacesContext context, File file, boolean attachment) throws IOException {
sendFile(context, new FileInputStream(file), file.getName(), file.length(), attachment);
}
public static void sendFile(FacesContext context, byte[] content, String filename, boolean attachment) throws IOException {
sendFile(context, new ByteArrayInputStream(content), filename, (long) content.length, attachment);
}
public static void sendFile(FacesContext context, InputStream content, String filename, boolean attachment) throws IOException {
sendFile(context, content, filename, -1L, attachment);
}
private static void sendFile(FacesContext context, InputStream input, String filename, long contentLength, boolean attachment) throws IOException {
ExternalContext externalContext = context.getExternalContext();
externalContext.setResponseBufferSize(10240);
externalContext.setResponseContentType(getMimeType(context, filename));
externalContext.setResponseHeader("Content-Disposition", String.format("%s;filename=\"%2$s\"; filename*=UTF-8\'\'%2$s", new Object[]{attachment ? "attachment" : "inline", encodeURL(filename)}));
if (((HttpServletRequest) externalContext.getRequest()).isSecure()) {
externalContext.setResponseHeader("Cache-Control", "public");
externalContext.setResponseHeader("Pragma", "public");
}
if (contentLength != -1L) {
externalContext.setResponseHeader("Content-Length", String.valueOf(contentLength));
}
long size = stream(input, externalContext.getResponseOutputStream());
if (contentLength == -1L) {
externalContext.setResponseHeader("Content-Length", String.valueOf(size));
}
context.responseComplete();
}
public static String getMimeType(FacesContext context, String name) {
String mimeType = context.getExternalContext().getMimeType(name);
if (mimeType == null) {
mimeType = "application/octet-stream";
}
return mimeType;
}
public static long stream(InputStream input, OutputStream output) throws IOException {
    try (ReadableByteChannel inputChannel = Channels.newChannel(input);
         WritableByteChannel outputChannel = Channels.newChannel(output)) {
        ByteBuffer buffer = ByteBuffer.allocateDirect(10240);
        long size = 0L;
        while (inputChannel.read(buffer) != -1) {
            buffer.flip();
            size += outputChannel.write(buffer);
            buffer.clear();
        }
        return size;
    }
}
public static String encodeURL(String string) {
if (string == null) {
return null;
} else {
try {
return URLEncoder.encode(string, StandardCharsets.UTF_8.name());
} catch (UnsupportedEncodingException var2) {
throw new UnsupportedOperationException("UTF-8 is apparently not supported on this platform.", var2);
}
}
}
}
Something I cannot understand: when the download is done by Chrome's native downloader, without any download manager like IDM or EagleGet, I do not get any ClientAbortException, but when I use these download manager programs (with their add-ons enabled) I get this error.
What is happening? I know this error happens when a connection is lost... but I did not close my page or do anything else that would cause this error!
and this is my bean code:
@ManagedBean(name = "bean")
@RequestScoped
public class MB implements Serializable {
public void MBdowan() throws IOException {
File file = new File("E:\\Animation\\IA\\Learning movies\\webinar1\\01_Aug_webinar_08\\Aug08_edited_webinar_animation.mov");
FileUtil.sendFile(file,true);
}
}
and this is my xhtml page :
</h:head>
<h:body>
<h:form>
<p:commandButton value="Download file" ajax="false" actionListener="#{bean.MBdowan}"/>
</h:form>
</h:body>
Download accelerators (and media players!) expect files which are idempotently available via GET and HEAD requests (i.e. when just typing the URL in the browser's address bar) and preferably also support HTTP Range requests (so multiple HTTP connections can be opened to download parts simultaneously). The JSF backing bean method is only invoked on a POST request (i.e. when submitting an HTML form with method="post"). The ClientAbortException happens because the download accelerator didn't get the response it expected while sniffing for HEAD and Range support, and aborted the connection.
If those files are static, then your best bet is to create a separate servlet which supports HEAD and preferably also HTTP Range requests.
Given that you clearly ripped off the source code from OmniFaces Faces#sendFile(), I'd suggest to rip off the source code of another OmniFaces artifact, the FileServlet. You can find snapshot showcase and source code link here: OmniFaces (2.2) FileServlet.
Here's how you could use it:
@WebServlet("/webinar_animation.mov")
public class YourFileServlet extends FileServlet {
@Override
protected File getFile(HttpServletRequest request) throws IllegalArgumentException {
return new File("E:\\Animation\\IA\\Learning movies\\webinar1\\01_Aug_webinar_08\\Aug08_edited_webinar_animation.mov");
}
}
<a href="#{request.contextPath}/webinar_animation.mov">Download file</a>
See also:
How to stream audio/video files such as MP3, MP4, AVI, etc using a Servlet
We have a Java EE application that uses Jython to execute some Python scripts. Over time the used heap space grows until there is no heap space left. In a heap dump I can see that there are a lot of Py* classes.
So I wrote a small test program:
TestApp
public class TestApp {
private final ScriptEngineManager scriptEngineManager = new ScriptEngineManager();
private HashMap<String, ScriptEngine> scriptEngines = new HashMap<String, ScriptEngine>();
private final String scriptContainerPath = "";
public static void main(String[] args) throws InterruptedException {
int counter = 1;
while(true) {
System.out.println("iteration: " + counter);
TestApp testApp = new TestApp();
testApp.execute();
counter++;
Thread.sleep(100);
}
}
void execute() {
File scriptContainer = new File(scriptContainerPath);
File[] scripts = scriptContainer.listFiles();
if (scripts != null && scripts.length > 0) {
Arrays.sort(scripts, new Comparator<File>() {
@Override
public int compare(File file1, File file2) {
return file1.getName().compareTo(file2.getName());
}
});
for (File script : scripts) {
String engineName = ScriptExecutor.getEngineNameByExtension(script.getName());
if(!scriptEngines.containsKey(engineName)) {
scriptEngines.put(engineName, scriptEngineManager.getEngineByName(engineName));
}
ScriptEngine scriptEngine = scriptEngines.get(engineName);
try {
ScriptExecutor scriptExecutor = new ScriptExecutor(scriptEngine, script, null);
Boolean disqualify = scriptExecutor.getBooleanScriptValue("disqualify");
String reason = scriptExecutor.getStringScriptValue("reason");
System.out.println("disqualify: " + disqualify);
System.out.println("reason: " + reason);
} catch (Exception e) {
e.printStackTrace();
}
}
// cleanup
for(Map.Entry<String, ScriptEngine> entry : scriptEngines.entrySet()) {
ScriptEngine engine = entry.getValue();
engine.getContext().setErrorWriter(null);
engine.getContext().setReader(null);
engine.getContext().setWriter(null);
}
}
}
}
ScriptExecutor
public class ScriptExecutor {
private final static String pythonExtension = "py";
private final static String pythonEngine = "python";
private final ScriptEngine scriptEngine;
public ScriptExecutor(ScriptEngine se, File file, Map<String, Object> keyValues) throws FileNotFoundException, ScriptException {
scriptEngine = se;
if (keyValues != null) {
for (Map.Entry<String, Object> entry : keyValues.entrySet()) {
scriptEngine.put(entry.getKey(), entry.getValue());
}
}
// execute script
Reader reader = null;
try {
reader = new FileReader(file);
scriptEngine.eval(reader);
} finally {
if (reader != null) {
try {
reader.close();
} catch (IOException e) {
// nothing to do
}
}
}
}
public Boolean getBooleanScriptValue(String key) {
// convert Object to Boolean (minimal conversion; the real code may do more)
Object value = scriptEngine.get(key);
return (value instanceof Boolean) ? (Boolean) value : null;
}
public String getStringScriptValue(String key) {
// convert Object to String (minimal conversion; the real code may do more)
Object value = scriptEngine.get(key);
return (value != null) ? value.toString() : null;
}
public static String getEngineNameByExtension(String fileName) {
String extension = fileName.substring(fileName.lastIndexOf(".") + 1);
if (pythonExtension.equalsIgnoreCase(extension)) {
System.out.println("Found engine " + pythonEngine + " for extension " + extension + ".");
return pythonEngine;
}
throw new RuntimeException("No suitable engine found for extension " + extension);
}
}
In the specified directory are 14 python scripts that all look like this:
disqualify = True
reason = "reason"
I start this program with the following VM-arguments:
-Xrs -Xms16M -Xmx16M -XX:MaxPermSize=32M -XX:NewRatio=3 -Dsun.rmi.dgc.client.gcInterval=300000 -Dsun.rmi.dgc.server.gcInterval=300000 -XX:+UseConcMarkSweepGC -XX:+UseParNewGC -XX:+CMSParallelRemarkEnabled -verbose:gc -XX:+PrintGCDetails -XX:+PrintGCTimeStamps -server
These are the arguments our AppServer is running with. Only Xms, Xmx and MaxPermSize are smaller in my testcase.
When I run this application I can see that the CMS Old Gen pool increases to its maximum size. After that the Par Eden Space pool increases. In addition, at some point the ParNew GC stops running altogether. The cleanup part improved the situation but didn't resolve the problem. Does anybody have an idea why my heap isn't completely cleaned?
I think I have found a solution for my problem: I removed the JSR-223 stuff and now use the PythonInterpreter directly.
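For reference, a minimal sketch of the direct approach (assuming Jython 2.7, where PythonInterpreter implements AutoCloseable; the script path is illustrative):
import org.python.core.PyObject;
import org.python.util.PythonInterpreter;

public class DirectJythonRunner {
    public static void main(String[] args) {
        // One interpreter per run; closing it releases its resources instead of
        // leaving Py* objects behind as happened with the JSR-223 engine.
        try (PythonInterpreter interpreter = new PythonInterpreter()) {
            interpreter.execfile("scripts/check.py"); // hypothetical script path
            PyObject disqualify = interpreter.get("disqualify");
            PyObject reason = interpreter.get("reason");
            System.out.println("disqualify: " + disqualify);
            System.out.println("reason: " + reason);
        }
    }
}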
Friends, help me, please!
I am trying to post a file from Silverlight. I use this class:
public class HttpHelper
{
public WebRequest Request { get; set; }
public Stream Filestream { get; private set; }
public HttpHelper(Stream filestream)
{
Request = WebRequest.Create("http://www.mysite.com/recieve");
Request.Method = "POST";
Request.ContentType = "application/octet-stream";
Filestream = filestream;
}
private static void BeginFilePostRequest(IAsyncResult ar)
{
HttpHelper helper = ar.AsyncState as HttpHelper;
if (helper != null)
{
byte[] bytes = new byte[helper.Filestream.Length];
int sf = helper.Filestream.Read(bytes, 0, (int)helper.Filestream.Length);
//helper.Request.ContentLength = bytes.Length; //It doesn't work in SL
using (StreamWriter writer = new StreamWriter(helper.Request.EndGetRequestStream(ar)))
{
writer.Write(bytes);
}
helper.Request.BeginGetResponse(new AsyncCallback(HttpHelper.BeginResponse), helper);
}
}
private static void BeginResponse(IAsyncResult ar)
{
HttpHelper helper = ar.AsyncState as HttpHelper;
if (helper != null)
{
HttpWebResponse response = (HttpWebResponse)helper.Request.EndGetResponse(ar);
if (response != null)
{
Stream stream = response.GetResponseStream();
if (stream != null)
{
using (StreamReader reader = new StreamReader(stream))
{
//anything...
}
}
}
}
}
public void PostFile()
{
this.Request.BeginGetRequestStream(new AsyncCallback(HttpHelper.BeginFilePostRequest), this);
}
}
I have a Stream in my Silverlight application and try to call PostFile when the submit button is clicked:
private void submit_button_Click(object sender, RoutedEventArgs e)
{
//...
HttpHelper helper = new HttpHelper(memory_stream);
helper.PostFile();
}
But mysite receives the request without the file; it just has a ContentLength of 13. What's the problem?
Try calling Flush on your writer before exiting the using block. You should also call Close on the stream retrieved from EndGetRequestStream.
You HAVE TO Flush and Dispose the HTTP request stream and all downstream streams; then it works.
I'd like to use an entire file as a single record for MAP processing, with the filename as the key.
I've read the following post: How to get Filename/File Contents as key/value input for MAP when running a Hadoop MapReduce Job?
and while the theory of the top answer is solid, no code or "how-to" is actually provided.
Here is my custom FileInputFormat and the corresponding RecordReader, which compile, yet do not produce ANY record data.
Thanks for any help.
public class CommentsInput
extends FileInputFormat<Text,Text> {
protected boolean isSplitable(FileSystem fs, Path filename)
{
return false;
}
@Override
public RecordReader<Text, Text> createRecordReader(InputSplit split, TaskAttemptContext ctx)
throws IOException, InterruptedException {
return new CommentFileRecordReader((FileSplit) split, ctx.getConfiguration());
}
/////////////////////////
public class CommentFileRecordReader
extends RecordReader<Text,Text> {
private InputStream in;
private long start;
private long length;
private long position;
private Text key;
private Text value;
private boolean processed;
private FileSplit fileSplit;
private Configuration conf;
public CommentFileRecordReader(FileSplit fileSplit, Configuration conf) throws IOException
{
this.fileSplit = fileSplit;
this.conf=conf;
}
/** Boilerplate initialization code for file input streams. */
@Override
public void initialize(InputSplit split,
TaskAttemptContext context)
throws IOException, InterruptedException {
Configuration conf = context.getConfiguration();
fileSplit = (FileSplit) split;
this.start = fileSplit.getStart();
this.length = fileSplit.getLength();
this.position = 0;
this.processed = false;
Path path = fileSplit.getPath();
FileSystem fs = path.getFileSystem(conf);
FSDataInputStream in = fs.open(path);
CompressionCodecFactory codecs = new CompressionCodecFactory(conf);
CompressionCodec codec = codecs.getCodec(path);
if (codec != null)
this.in = codec.createInputStream(in);
else
this.in = in;
// If using Writables:
// key = new Text();
// value = new Text();
}
public boolean next(Text key, Text value) throws IOException
{
if(!processed)
{
key = new Text(fileSplit.getPath().toString());
Path file = fileSplit.getPath();
FileSystem fs = file.getFileSystem(conf);
FSDataInputStream in = null;
byte[] contents = new byte[(int) fileSplit.getLength()];
try
{
in = fs.open(file);
IOUtils.readFully(in, contents, 0, contents.length);
value.set(contents.toString());
}
finally
{
IOUtils.closeStream(in);
}
processed = true;
return true;
}
return false;
}
@Override
public boolean nextKeyValue() throws IOException {
// TODO parse the next key value, update position and return true.
return false;
}
@Override
public Text getCurrentKey() {
return key;
}
@Override
public Text getCurrentValue() {
return value;
}
/** Returns our progress within the split, as a float between 0 and 1. */
@Override
public float getProgress() {
if (length == 0)
return 0.0f;
return Math.min(1.0f, position / (float)length);
}
@Override
public void close() throws IOException {
if (in != null)
in.close();
}
}
You need to define your own key class and make sure your classes use it. You can look up how to define your own key class, and you can get the file name by calling the getName() method on the split's path, then use it to build your key.
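For what it's worth, the immediate reason no record data appears is that the new-API nextKeyValue() always returns false, so the framework never sees a record; the old-API next(key, value) method above is never called. Below is a sketch of a whole-file nextKeyValue() that merges in the logic from next() (an assumption about the intended behavior; it also uses new String(contents, ...) because calling toString() on a byte[] yields the array reference, not the file text):
@Override
public boolean nextKeyValue() throws IOException {
    // Emit the whole file as a single (filename, contents) record, then stop.
    if (processed) {
        return false;
    }
    Path file = fileSplit.getPath();
    FileSystem fs = file.getFileSystem(conf);
    byte[] contents = new byte[(int) fileSplit.getLength()];
    try (FSDataInputStream stream = fs.open(file)) {
        IOUtils.readFully(stream, contents, 0, contents.length);
    }
    key = new Text(file.toString());
    value = new Text(new String(contents, java.nio.charset.StandardCharsets.UTF_8));
    position = length;
    processed = true;
    return true;
}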