public void testDifferentLabels() throws Exception {
    // Locate the labeled corpus bundled on the classpath.
    ClassPathResource labeledResource = new ClassPathResource("/labeled");
    File labeledDir = labeledResource.getFile();

    // Iterator that yields sentences together with their directory-derived labels.
    LabelAwareSentenceIterator labelAwareIterator =
        LabelAwareUimaSentenceIterator.createWithPath(labeledDir.getAbsolutePath());
    TokenizerFactory tokenizerFactory = new UimaTokenizerFactory();

    // Train ParagraphVectors over the three sentiment labels.
    ParagraphVectors paragraphVectors = new ParagraphVectors.Builder()
        .minWordFrequency(1)
        .labels(Arrays.asList("negative", "neutral","positive"))
        .layerSize(100)
        .stopWords(new ArrayList<String>())
        .windowSize(5)
        .iterate(labelAwareIterator)
        .tokenizerFactory(tokenizerFactory)
        .build();
    paragraphVectors.fit();

    // Each label vector must have been trained away from the unknown-word vector.
    INDArray unknown = paragraphVectors.lookupTable().vector("UNK");
    assertNotEquals(unknown, paragraphVectors.lookupTable().vector("negative"));
    assertNotEquals(unknown, paragraphVectors.lookupTable().vector("positive"));
    assertNotEquals(unknown, paragraphVectors.lookupTable().vector("neutral"));
}
// Gets Path to Text file
// NOTE(review): everything from here down to the `package` declaration appears to be
// an orphaned, partially-duplicated fragment of the main(...) method defined later in
// this file — almost certainly the residue of a bad merge/concatenation. It is not
// syntactically valid on its own (the method opened mid-line below is never closed).
// TODO: delete this fragment after confirming nothing references it.
String filePath = new File(dataLocalPath,"raw_sentences.txt").getAbsolutePath();
// Strip white space before and after for each line
SentenceIterator iter = new BasicLineIterator(filePath); public static void main(String[] args) throws Exception {
dataLocalPath = DownloaderUtility.NLPDATA.Download();
// Gets Path to Text file
String filePath = new File(dataLocalPath,"raw_sentences.txt").getAbsolutePath();
log.info("Load & Vectorize Sentences....");
// Strip white space before and after for each line
SentenceIterator iter = new BasicLineIterator(filePath);
// Split on white spaces in the line to get words
TokenizerFactory t = new DefaultTokenizerFactory();
/*
CommonPreprocessor will apply the following regex to each token: [\d\.:,"'\(\)\[\]|/?!;]+
So, effectively all numbers, punctuation symbols and some special symbols are stripped off.
Additionally it forces lower case for all tokens.
*/
// NOTE(review): the fused `package` statement on the next line marks where the real
// Word2VecRawTextExample source file begins; the fragment above duplicates its main().
t.setTokenPreProcessor(new CommonPreprocessor());package org.deeplearning4j.examples.nlp.word2vec;
import org.deeplearning4j.examples.download.DownloaderUtility;
import org.deeplearning4j.models.word2vec.Word2Vec;
import org.deeplearning4j.text.sentenceiterator.BasicLineIterator;
import org.deeplearning4j.text.sentenceiterator.SentenceIterator;
import org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor;
import org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory;
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File;
import java.util.Collection;
/**
* Created by agibsonccc on 10/9/14.
*
* Neural net that processes text into wordvectors. See below url for an in-depth explanation.
* https://deeplearning4j.org/word2vec.html
*/
public class Word2VecRawTextExample {
// NOTE(review): SLF4J convention is `private static final`; consider adding `final`.
private static Logger log = LoggerFactory.getLogger(Word2VecRawTextExample.class);
// Populated in main() with the local path of the downloaded NLP dataset.
public static String dataLocalPath;
public static void main(String[] args) throws Exception {
    dataLocalPath = DownloaderUtility.NLPDATA.Download();
    // Gets Path to Text file
    String corpusPath = new File(dataLocalPath,"raw_sentences.txt").getAbsolutePath();

    log.info("Load & Vectorize Sentences....");
    // Strip white space before and after for each line
    SentenceIterator sentenceIterator = new BasicLineIterator(corpusPath);
    // Split on white spaces in the line to get words
    TokenizerFactory tokenizerFactory = new DefaultTokenizerFactory();
    /*
      CommonPreprocessor will apply the following regex to each token: [\d\.:,"'\(\)\[\]|/?!;]+
      So, effectively all numbers, punctuation symbols and some special symbols are stripped off.
      Additionally it forces lower case for all tokens.
    */
    tokenizerFactory.setTokenPreProcessor(new CommonPreprocessor());

    log.info("Building model....");
    Word2Vec vec = new Word2Vec.Builder()
        .minWordFrequency(5)
        .iterations(1)
        .layerSize(100)
        .seed(42)
        .windowSize(5)
        .iterate(sentenceIterator)
        .tokenizerFactory(tokenizerFactory)
        .build();

    log.info("Fitting Word2Vec model....");
    vec.fit();

    log.info("Writing word vectors to text file....");
    // Prints out the closest 10 words to "day". An example on what to do with these Word Vectors.
    log.info("Closest Words:");
    Collection<String> nearestWords = vec.wordsNearestSum("day", 10);
    log.info("10 Words closest to 'day': {}", nearestWords);
}
}
// Mechanism for handling general NLP tasks in DL4J.
addToken(new VocabWord(1.0,"myword"));
addWordToIndex(0, Word2Vec.UNK);
putVocabWord(Word2Vec.UNK);
// Iteration of words, documents, and sentences for language processing in DL4J.
SentenceIterator iter = new LineSentenceIterator(new File("your file"));
Collection<String> sentences = ...;
SentenceIterator iter = new CollectionSentenceIterator(sentences);
SentenceIterator iter = new FileSentenceIterator(new File("your dir or file"));
TokenizerFactory tokenizerFactory = new DefaultTokenizerFactory();
Tokenizer tokenizer = tokenizerFactory.tokenize("mystring");
//iterate over the tokens
while(tokenizer.hasMoreTokens()) {
String token = tokenizer.nextToken();
}
//get the whole list of tokens
List<String> tokens = tokenizer.getTokens();