Wednesday, March 2, 2016

Sequence file in Hadoop


 What is a sequence file in Hadoop?
·              A file that stores key/value pairs in a binary format
·              Because it is a binary format, it can be compressed, which means less disk space, fewer I/O operations, and less bandwidth
·              It also solves the small-files problem (the whole content of each small file becomes a value in the sequence file)

Now we are going to look at how to convert a large number of small files into a sequence file.

Below is the Java code for writing a sequence file:


 public class SequenceFileWritter {  
     public static void main(String[] args) throws IOException {  
        String uri = args[1];  
        Configuration conf = new Configuration();  
        FileSystem fs = FileSystem.get(conf);  
        Path path = new Path(uri);  
        Text key = new Text();  
        Text value = new Text();  
        File infolder = new File(args[0]);  
        SequenceFile.Writer writer = null;  
        try {  
            FSDataOutputStream stm = fs.create(path);  
            writer = SequenceFile.createWriter(conf, stm, key.getClass(), value.getClass(),  
             SequenceFile.CompressionType.BLOCK, new DefaultCodec(), new Metadata());  
            File[] listOfFiles = infolder.listFiles();  
            System.out.printf("Folder is ", infolder.toString());  
            if (null != listOfFiles) {  
               System.out.printf("# of files ", listOfFiles.length);  
               for (int i = 0; i < listOfFiles.length; i++) {  
                  if (listOfFiles[i].isFile()) {  
                      key.set(listOfFiles[i].getName());  
                      value.set(listOfFiles[i].getPath());  
                      writer.append(key, value);  
                      System.out.printf("[%s]\t%s\t%s\n", writer.getLength(), key, value);  
                  } else if (listOfFiles[i].isDirectory()) {  
                      System.out.println("Directory " + listOfFiles[i].getName());  
                  }  
               }  
            } else {  
               System.out.printf("list of files is null ", " check ");  
            }  
        } finally {  
            IOUtils.closeStream(writer);  
        }  
     }  
 }  


To read a sequence file:

 public class SequenceFileRead  
 {        
  public static void main(String[] args) throws IOException {  
     String uri = args[0];  
     Configuration conf = new Configuration();  
     Path path = new Path(uri);  
     SequenceFile.Reader reader = null;  
     FileSystem fs = FileSystem.get(conf);  
     try {      
     reader = new SequenceFile.Reader(fs, path, conf);  
     Writable key = (Writable) ReflectionUtils.newInstance(reader.getKeyClass(), conf);  
     Writable value = (Writable) ReflectionUtils.newInstance(reader.getValueClass(), conf);  
     while (reader.next(key, value)) {  
      String syncSeen = reader.syncSeen() ? "sync" : "";  
      System.out.printf("[%s]\t%s\t%s\n", syncSeen, key, value);  
     }  
     } finally {  
        IOUtils.closeStream(reader);  
        }        
     }  
 }  


No comments:

Post a Comment