package org.deeplearning4j.examples.nlp.word2vec;
import org.datavec.api.util.ClassPathResource;
import org.deeplearning4j.models.word2vec.Word2Vec;
import org.deeplearning4j.text.sentenceiterator.BasicLineIterator;
import org.deeplearning4j.text.sentenceiterator.SentenceIterator;
import org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor;
import org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory;
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.Collection;
/**
* Created by agibsonccc on 10/9/14.
* 将文本处理成词向量的神经网络,查看下面的URL获取深入解释。
* https://deeplearning4j.org/word2vec.html
*/
public class Word2VecRawTextExample {
private static Logger log =
LoggerFactory.getLogger(Word2VecRawTextExample.class);
public static void main(String[] args) throws Exception {
// 得到文本文件的路径
String filePath = new ClassPathResource("raw_sentences.txt").getFile().getAbsolutePath();
log.info("加载 & 向量化句子....");
// 每行前后空白间隔
SentenceIterator iter = new BasicLineIterator(filePath);
// 每行用空格分割以获取单词
TokenizerFactory t = new DefaultTokenizerFactory();
/* CommonPreprocessor 将应用如下正则表达式到每个词: [\d\.:,"'\(\)\[\]|/?!;]+
所以有效的删除所有数字,标点符号,和特符号,并把所有词转换为小写。 */
t.setTokenPreProcessor(new CommonPreprocessor());
log.info("构建模型....");
Word2Vec vec = new Word2Vec.Builder()
.minWordFrequency(5)
.iterations(1)
.layerSize(100)
.seed(42)
.windowSize(5)
.iterate(iter)
.tokenizerFactory(t)
.build();
log.info("拟合 Word2Vec 模型....");
vec.fit();
log.info("将词向量写入文本文件....");
// 打印出最接近“day”的10个词。一个如何处理这些单词向量的例子。
log.info("最接近的词:");
Collection<String> lst = vec.wordsNearestSum("day", 10);
log.info("10 个最接近 'day': {}", lst);
// TODO 解决丢失的 UiServer
// UiServer server = UiServer.getInstance();
// System.out.println("Started on port " + server.getPort()); }}