当前位置:文档之家› JAVA实现大文件排序

JAVA实现大文件排序

package com.scott.util;import java.io.*;import java.util.ArrayList;import parator;import java.util.Iterator;import java.util.List;/*** Created by Scott on 2017/11/1.*/public class LargeFileDataSort {// 测试大文件路径public final static String testFilePath = "E:/dataTest/largeFileData.txt";public final static String resultFilePath = "E:/dataTest/largeFileResult.txt";// 切分大文件的小文件大小MB, 默认为100MBprivate final static int size = 200;private static int byteSize = size * 1024 * 1024;public static void main(String[] args) throws IOException {// 生成测试文件createTestData();Long start = System.currentTimeMillis();work();Long end = System.currentTimeMillis();System.out.println((end - start) / 1000/ 60);}/*** 切分文件每份大小*/public static void work() throws IOException {File file = new File(testFilePath);if (!file.exists()) {return;}// 2.1 得到文件大小MBdouble mbsize = file.length() / 1024 / 1024;// 2.2 计算得到切分的文件数double fileNum = Math.ceil(mbsize / size);// 2.3 临时文件List<File> tempFileList = createTempFileList(file, fileNum);// 2.3 切分文件divAndFirstSort(file, tempFileList);// 2.4 递归排序(每个文件读取多少数据放到内存排序后合并到结果文件)// 排序合并开始mergeLargeFile(tempFileList);// 2.5 TODO 把临时文件删除}/*** 生成测试文件*/public static void createTestData() {StringBuffer sb = new StringBuffer();BufferedWriter bw = null;try {File testFile = new File(testFilePath);if (!testFile.exists()) {testFile.createNewFile();}bw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(testFile)));for (long i = 1; i <= 27777777; i++) {sb.setLength(0);sb.append(i).append("@@");// sb.append(random.nextInt(100000)).append("@@");sb.append(i).append("调用BufferedWriter的flush()方法").append("@@");sb.append(i).append("调用BufferedWriter的flush()方法").append("@@");sb.append(i).append("调用BufferedWriter的flush()方法").append("@@");sb.append(i).append("调用BufferedWriter的flush()方法").append("@@");sb.append(i).append("调用BufferedWriter的flush()方法").append("@@");sb.append(i).append("调用BufferedWriter的flush()方法").append("@@");sb.append(i).append("调用BufferedWriter的flush()方法").append("@@");sb.append(i).append("调用BufferedWriter的flush()方法").append("@@");sb.append(i).append("调用BufferedWriter的flush()方法").append("@@");sb.append(i).append("调用BufferedWriter的flush()方法").append("@@");sb.append(i).append("调用BufferedWriter的flush()方法").append("@@");sb.append(i).append("调用BufferedWriter的flush()方法").append("@@");sb.append(i).append("调用BufferedWriter的flush()方法").append("@@");sb.append(i).append("调用BufferedWriter的flush()方法").append("@@");sb.append(i).append("调用BufferedWriter的flush()方法").append("@@");sb.append(i).append("调用BufferedWriter的flush()方法").append("@@");sb.append("\n");bw.write(sb.toString());if ((i + 1) % 5000 == 0) {bw.flush();}System.out.println(i);}} catch (IOException e) {System.out.println("生成测试文件失败!" + e.getMessage());} finally {try {if (bw != null) {bw.close();}} catch (IOException e) {}}}/*** 把临时文件合并到结果文件* @param tempFileList* @throws IOException*/public static void mergeLargeFile(List<File> tempFileList) throws IOException {List<FileEntity> bwList = new ArrayList<FileEntity>();for(int i=0; i< tempFileList.size(); i++) {FileEntity le = new FileEntity(new BufferedReader(new InputStreamReader(new FileInputStream(tempFileList.get(i)))));bwList.add(le);}BufferedWriter resultBw = null;try {resultBw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(resultFilePath)));Long count = 0L;FileEntity fe = null;while ((fe = getFirstFileEntity(bwList)) != null) {System.out.println("--- 写文件id: " + fe.getId());// 写入符合条件的一行数据resultBw.write(fe.getLine() + "\n");// 准备下一行fe.nextLine();// 清缓冲流if (count % 1000 == 0) {resultBw.flush();}}} catch (Exception e) {} finally {if (resultBw != null) {try {resultBw.close();} catch (IOException e) {}}}// 关闭for(int i=0; i< bwList.size(); i++) {bwList.get(i).close();}}/*** 从切分的文件中按序行读取(因为切分文件时已经做好了排序)* @param bwList* @return*/private static FileEntity getFirstFileEntity(List<FileEntity> bwList) { if (bwList.size() == 0) {return null;}Iterator<FileEntity> it = bwList.iterator();while (it.hasNext()) {FileEntity fe = it.next();// 如果文件读到完就关闭流和删除在集合的文件流if (fe.getLine() == null) {fe.close();it.remove();}}if (bwList.size() == 0) {return null;}// 排序获取一行数据bwList.sort(new FileEntityComparator());// 返回第一个符合条件的文件对象return bwList.get(0);}/*** 切分文件并做第一次排序* @param file* @param tempFileListprivate static void divAndFirstSort(File file, List<File> tempFileList) {BufferedReader br = null;try {// 读取大文件br = new BufferedReader(new InputStreamReader(new FileInputStream(file)));// 行数据保存对象String line = null;// 临时文件索引int index = tempFileList.size() - 1;// 第一个临时文件File tempFile = tempFileList.get(index);List<String> lineList = new ArrayList<>();int byteSum = 0;// 循环临时文件并循环大文件while ((line = br.readLine()) != null) {line += "\n";byteSum += line.getBytes().length;// 如果长度达到每个文件大小则重新计算if (byteSum >= byteSize) {// 写入到文件putLineListToFile(tempFileList.get(index), lineList);index--;byteSum = line.getBytes().length;lineList.clear();}lineList.add(line);}if (lineList.size() > 0) {// 写入到文件putLineListToFile(tempFileList.get(0), lineList);}} catch (FileNotFoundException e) {e.printStackTrace();} catch (IOException e) {e.printStackTrace();} finally {try {if (br != null) {br.close();}} catch (IOException e) {}}/*** 把数据写到临时文件* @param lineList*/private static void putLineListToFile(File file, List<String> lineList) throws IOException { FileOutputStream tempFileFos = null;try {// 很关键的一步,第一次写入文件必须排序lineList.sort(new LineComparator());tempFileFos = new FileOutputStream(file);for(int i=0; i< lineList.size(); i++) {tempFileFos.write(lineList.get(i).getBytes());}} finally {if (tempFileFos != null) {tempFileFos.close();}}}/*** 生成临时文件* @param fileNum* @return*/private static List<File> createTempFileList(File file, double fileNum) {List<File> tempFileList = new ArrayList<File>();String fileFolder = file.getParent();String name = file.getName();for (int i = 0; i < fileNum; i++) {File tempFile = new File(fileFolder + "/" + name + ".temp_" + i + ".txt");if (tempFile.exists()) {tempFile.delete();}try {tempFile.createNewFile();} catch (IOException e) {e.printStackTrace();}tempFileList.add(tempFile);}return tempFileList;}public static int compare(String o1, String o2) {String o1Id = o1.substring(0, o1.indexOf("@@"));String o2Id = o2.substring(0, o2.indexOf("@@"));// 从小到大return Integer.parseInt(o1Id) - Integer.parseInt(o2Id);// 从大到小// return Integer.parseInt(o2Id) - Integer.parseInt(o1Id);}}/*** 排序*/class LineComparator implements Comparator<String> {@Overridepublic int compare(String o1, String o2) {return pare(o1, o2);}}/*** 排序类*/class FileEntityComparator implements Comparator<FileEntity> { @Overridepublic int compare(FileEntity o1, FileEntity o2) {return pare(o1.getLine(), o2.getLine());}}class FileEntity {private Long id = null;private String line = null;private BufferedReader br;public FileEntity(BufferedReader br) throws IOException {this.br = br;// 初始化读取第一行setLineId();}/*** 使用来排序的数据* @throws IOException*/private void setLineId() throws IOException {line = br.readLine();if (line != null) {try {id = Long.parseLong(line.substring(0, line.indexOf("@@")));} catch (NumberFormatException e) {id = null;}}}/*** 关闭流*/public void close() {if (this.br != null) {try {this.br.close();} catch (Exception e) {}}}/*** 读取下一行* @return*/public FileEntity nextLine() {try {setLineId();} catch (IOException e) {}return this;}public Long getId() {return id;}public void setId(Long id) {this.id = id;}public String getLine() {return line;}}。

相关主题