package org.deeplearning4j.examples.nlp.word2vec;
import org.deeplearning4j.examples.download.DownloaderUtility;
import org.deeplearning4j.models.word2vec.Word2Vec;
import org.deeplearning4j.text.sentenceiterator.BasicLineIterator;
import org.deeplearning4j.text.sentenceiterator.SentenceIterator;
import org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor;
import org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory;
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File;
import java.util.Collection;
/**
 * Created by agibsonccc on 10/9/14.
 *
 * Neural net that processes text into word vectors. See the URL below for an in-depth explanation.
 * https://deeplearning4j.org/word2vec.html
 */
public class Word2VecRawTextExample {
private static Logger log = LoggerFactory.getLogger(Word2VecRawTextExample.class);
public static String dataLocalPath;
public static void main(String[] args) throws Exception {
dataLocalPath = DownloaderUtility.NLPDATA.Download();
// Build the absolute path to the raw_sentences.txt file under dataLocalPath
String filePath = new File(dataLocalPath,"raw_sentences.txt").getAbsolutePath();
log.info("Load & Vectorize Sentences....");
// Strip leading and trailing whitespace from each line
SentenceIterator iter = new BasicLineIterator(filePath);
// Split on white spaces in the line to get words
TokenizerFactory t = new DefaultTokenizerFactory();
/*
    CommonPreprocessor will apply the following regex to each token: [\d\.:,"'\(\)\[\]|/?!;]+
    So, effectively, all numbers, punctuation symbols and some special symbols are stripped off.
    Additionally, it forces all tokens to lower case.
 */
t.setTokenPreProcessor(new CommonPreprocessor());
log.info("Building model....");
Word2Vec vec = new Word2Vec.Builder()
log.info("Fitting Word2Vec model....");
log.info("Writing word vectors to text file....");
// Print the 10 words closest to "day" -- an example of what can be done with these word vectors.
log.info("Closest Words:");
Collection<String> lst = vec.wordsNearestSum("day", 10);
log.info("10 Words closest to 'day': {}", lst);