基于java对doc文档的分词,导入数据库

时间:2019-03-20
本文章向大家介绍基于java对doc文档的分词,导入数据库,主要包括基于java对doc文档的分词,导入数据库使用实例、应用技巧、基本知识点总结和需要注意事项,具有一定的参考价值,需要的朋友可以参考一下。

这篇word文档都是正规的文本文字,有一定的格式,其中没有图片等难以处理的内容

我也是刚开始学习对word文档的处理,其中也有很多不懂的地方

Apache POI是Apache软件基金会的开放源码函式库,POI提供API给Java程序对Microsoft Office格式档案读和写的功能。

1、首先从官网下载poi的包,下载地址:http://poi.apache.org/download.html

2、然后就是利用函数对文档的处理

读取doc文档

  

/**
 * Reads the full text of a legacy Word ({@code .doc}) file.
 *
 * <p>Any exception raised while opening or parsing the file is printed via
 * {@code printStackTrace()} and an empty string is returned instead.
 *
 * @param file the {@code .doc} file to read
 * @return the document text, or {@code ""} if reading failed
 */
public static String contextOfDoc(File file) {
        String str = "";
        // try-with-resources closes the document and then the stream,
        // even when opening or text extraction throws.
        try (FileInputStream fis = new FileInputStream(file);
             HWPFDocument doc = new HWPFDocument(fis)) {
            str = doc.getDocumentText();
        } catch (Exception e) {
            e.printStackTrace();
        }
        return str;
    }

测试

/**
 * Demo entry point: reads {@code src/1.doc}, splits its text on carriage
 * returns and prints the pieces at indices 9 up to (but excluding) 284.
 *
 * @param args unused
 */
public static void main(String[] args) {
        File file = new File("src/1.doc");
        String str = contextOfDoc(file);
        String[] arr = str.split("\r");
        // Clamp the upper bound: a shorter document (or the empty string that
        // contextOfDoc returns on failure) yields fewer than 284 pieces.
        int end = Math.min(284, arr.length);
        for (int i = 9; i < end; i++) {
            System.out.println(arr[i]);
        }
    }

先切分文档,分为目录和内容

/**
 * Loads the text of {@code src/1.doc} and splits it around the marker "第五篇".
 *
 * <p>NOTE(review): the accompanying article uses this to separate the
 * catalog from the content; which piece is which is not visible here.
 *
 * @return the pieces of the document text produced by the split
 */
public static String[] cataAndContext() {
        String wholeText = docIo.contextOfDoc(new File("src/1.doc"));
        return wholeText.split("第五篇");
    }

对目录和内容分别切分

/**
 * Cuts the given text into four catalog sections using the fixed headings
 * "新技术篇", "网络安全篇", "基础篇", "国家信息化政策规划篇" and "附录".
 *
 * <p>The sections are, in order: the text between "新技术篇" and "网络安全篇",
 * between "网络安全篇" and "基础篇", between "基础篇" and
 * "国家信息化政策规划篇", and between "国家信息化政策规划篇" and "附录".
 *
 * @param str the text to cut
 * @return a mutable list holding the four sections in the order above
 * @throws ArrayIndexOutOfBoundsException if an expected heading is missing
 */
public static List<String> typePart(String str) {
        // Everything after the first "新技术篇" heading
        String rest = str.split("新技术篇")[1];

        // Part one catalog; each split is computed once and reused
        String[] pieces = rest.split("网络安全篇");
        String partOneCatalog = pieces[0];
        rest = pieces[1];

        // Part two catalog
        pieces = rest.split("基础篇");
        String partTwoCatalog = pieces[0];
        rest = pieces[1];

        // Part three catalog
        pieces = rest.split("国家信息化政策规划篇");
        String partThreeCatalog = pieces[0];
        rest = pieces[1];

        // Part four catalog ends where the appendix begins
        String partFourCatalog = rest.split("附录")[0];

        List<String> strList = new ArrayList<>();
        strList.add(partOneCatalog);
        strList.add(partTwoCatalog);
        strList.add(partThreeCatalog);
        strList.add(partFourCatalog);
        return strList;
    }

对内容的处理

/**
 * Reads the paragraphs of {@code src/3.doc}, then for each catalog entry
 * returned by {@code CRUD.getCatalogs()} collects the paragraphs lying
 * between that entry and the next one and stores them through
 * {@code CRUD.insertContext}.
 *
 * @param args unused
 * @throws Exception if the document cannot be opened or read
 */
public static void main(String[] args) throws Exception {
        List<String> lists;
        // Release the file handle and the extractor as soon as the paragraphs are read.
        try (FileInputStream fis = new FileInputStream("src/3.doc");
             WordExtractor wordExtractor = new WordExtractor(fis)) {
            lists = getParas(wordExtractor.getParagraphText());
        }
        CRUD c = new CRUD();
        List<String> catas = c.getCatalogs();
        // Each entry is paired with its successor as the end marker, so the
        // last entry (which has no successor) is not processed.
        for (int i = 0; i < catas.size()-1; i++) {
            String context = getContext(catas.get(i), catas.get(i+1), lists);
            c.insertContext(catas.get(i), context);
        }
    }
    /**
     * Concatenates the paragraphs that follow {@code start} up to, but not
     * including, the next paragraph equal to {@code end}.
     *
     * <p>If no paragraph equal to {@code end} follows an occurrence of
     * {@code start}, everything up to the end of the list is accumulated and
     * scanning continues with later occurrences of {@code start}.
     *
     * @param start paragraph text that marks the beginning (exclusive)
     * @param end   paragraph text that marks the end (exclusive)
     * @param paras the paragraphs to scan
     * @return the concatenated paragraph text, or {@code ""} if {@code start} is not found
     */
    public static String getContext(String start,String end,List<String> paras) {
        // StringBuilder avoids re-copying the accumulated text on every append.
        StringBuilder context = new StringBuilder();
        for (int i = 0; i < paras.size(); i++) {
            if (!paras.get(i).equals(start)) {
                continue;
            }
            for (int j = i+1; j < paras.size(); j++) {
                String para = paras.get(j);
                if (para.equals(end)) {
                    return context.toString();
                }
                context.append(para);
            }
        }
        return context.toString();
    }
    
    /**
     * Returns the trimmed paragraphs from index 289 onwards.
     *
     * @param paras the paragraphs of the document
     * @return a mutable list of trimmed paragraphs; empty if there are 289 or fewer
     */
    public static List<String> getParas(String[] paras) {
        // 289 is the start index this file has always used for its document.
        return getParas(paras, 289);
    }

    /**
     * Returns the trimmed paragraphs from {@code firstIndex} onwards.
     *
     * @param paras      the paragraphs of the document
     * @param firstIndex index of the first paragraph to keep; must not be negative
     * @return a mutable list of trimmed paragraphs; empty if {@code firstIndex}
     *         is at or beyond the end of the array
     */
    public static List<String> getParas(String[] paras, int firstIndex) {
        List<String> paraList = new ArrayList<>(Math.max(0, paras.length - firstIndex));
        for (int i = firstIndex; i < paras.length; i++) {
            paraList.add(paras[i].trim());
        }
        return paraList;
    }

数据库的crud

/**
 * Reads every value of the {@code catalog} column from the {@code catalogs} table.
 *
 * <p>A {@link SQLException} is printed via {@code printStackTrace()} and not
 * rethrown; the rows read up to that point are returned.
 *
 * @return a mutable list of the catalog values, in the order the query returned them
 */
public List<String> getCatalogs(){
        List<String> lists = new ArrayList<>();
        Connection connection = Dbuitl.getConnection();
        String sql = "select catalog from catalogs";
        // The statement and result set are closed automatically here.
        try (Statement statement = connection.createStatement();
             ResultSet resultSet = statement.executeQuery(sql)) {
            while (resultSet.next()) {
                lists.add(resultSet.getString("catalog"));
            }
        } catch (SQLException e) {
            e.printStackTrace();
        } finally {
            Dbuitl.close(connection);
        }
        return lists;
    }
    
    /**
     * Inserts one row into the {@code catalogs} table.
     *
     * <p>A {@link SQLException} is printed via {@code printStackTrace()} and
     * not rethrown.
     *
     * @param type value for the {@code type} column
     * @param cata value for the {@code catalog} column
     */
    public void insert(String type,String cata) {
        Connection connection = Dbuitl.getConnection();
        String sql = "insert into catalogs(type,catalog) value(?,?)";
        PreparedStatement preparedStatement = null;
        try {
            preparedStatement = connection.prepareStatement(sql);
            preparedStatement.setString(1, type);
            preparedStatement.setString(2, cata);
            preparedStatement.executeUpdate();
        } catch (SQLException e) {
            e.printStackTrace();
        } finally {
            // Release the statement before the connection that created it.
            Dbuitl.close(preparedStatement);
            Dbuitl.close(connection);
        }
    }
    
    /**
     * Inserts one row into the {@code context} table.
     *
     * <p>A {@link SQLException} is printed via {@code printStackTrace()} and
     * not rethrown.
     *
     * @param catalog value for the {@code catalog} column
     * @param context value for the {@code context} column
     */
    public void insertContext(String catalog,String context) {
        Connection connection = Dbuitl.getConnection();
        String sql = "insert into context(catalog,context) value(?,?)";
        PreparedStatement preparedStatement = null;
        try {
            preparedStatement = connection.prepareStatement(sql);
            preparedStatement.setString(1, catalog);
            preparedStatement.setString(2, context);
            preparedStatement.executeUpdate();
        } catch (SQLException e) {
            e.printStackTrace();
        } finally {
            // Release the statement before the connection that created it.
            Dbuitl.close(preparedStatement);
            Dbuitl.close(connection);
        }
    }
    
    /**
     * Inserts one row into the {@code sheet} table.
     *
     * <p>A {@link SQLException} is printed via {@code printStackTrace()} and
     * not rethrown.
     *
     * @param sheet value for the {@code sheet} column
     * @param type  value for the {@code type} column
     */
    public void insertSheet(String sheet,String type) {
        Connection connection = Dbuitl.getConnection();
        String sql = "insert into sheet(sheet,type) value(?,?)";
        PreparedStatement preparedStatement = null;
        try {
            preparedStatement = connection.prepareStatement(sql);
            preparedStatement.setString(1, sheet);
            preparedStatement.setString(2, type);
            preparedStatement.executeUpdate();
        } catch (SQLException e) {
            e.printStackTrace();
        } finally {
            // Release the statement before the connection that created it.
            Dbuitl.close(preparedStatement);
            Dbuitl.close(connection);
        }
    }