Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit f8237b3

Browse files
committed
get the loading of the document to work
1 parent d993848 commit f8237b3

File tree

4 files changed

+513
-0
lines changed

4 files changed

+513
-0
lines changed

‎pom.xml

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,109 @@
88
<artifactId>java-text-embedding</artifactId>
99
<version>1.0.1</version>
1010

11+
<properties>
12+
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
13+
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
1114

15+
<maven.compiler.source>1.8</maven.compiler.source>
16+
<maven.compiler.target>1.8</maven.compiler.target>
17+
<java.version>1.8</java.version>
18+
19+
</properties>
20+
21+
22+
<dependencies>
23+
24+
<dependency>
25+
<groupId>com.google.guava</groupId>
26+
<artifactId>guava</artifactId>
27+
<version>20.0</version>
28+
</dependency>
29+
30+
<dependency>
31+
<groupId>org.projectlombok</groupId>
32+
<artifactId>lombok</artifactId>
33+
<version>1.16.10</version>
34+
<scope>provided</scope>
35+
</dependency>
36+
37+
<dependency>
38+
<groupId>com.alibaba</groupId>
39+
<artifactId>fastjson</artifactId>
40+
<version>1.2.33</version>
41+
</dependency>
42+
43+
<!-- Logging -->
44+
<dependency>
45+
<groupId>org.slf4j</groupId>
46+
<artifactId>slf4j-api</artifactId>
47+
<version>1.7.20</version>
48+
</dependency>
49+
50+
<dependency>
51+
<groupId>org.slf4j</groupId>
52+
<artifactId>slf4j-simple</artifactId>
53+
<version>1.7.20</version>
54+
</dependency>
55+
56+
<dependency>
57+
<groupId>org.apache.httpcomponents</groupId>
58+
<artifactId>httpclient</artifactId>
59+
<version>4.5.2</version>
60+
</dependency>
61+
62+
<dependency>
63+
<groupId>net.lingala.zip4j</groupId>
64+
<artifactId>zip4j</artifactId>
65+
<version>1.3.2</version>
66+
</dependency>
67+
68+
</dependencies>
69+
70+
<build>
71+
72+
<finalName>java-basic-blockchain</finalName>
73+
74+
<plugins>
75+
76+
<plugin>
77+
<groupId>org.apache.maven.plugins</groupId>
78+
<artifactId>maven-compiler-plugin</artifactId>
79+
<version>3.5.1</version>
80+
<configuration>
81+
<source>1.8</source>
82+
<target>1.8</target>
83+
<encoding>UTF-8</encoding>
84+
</configuration>
85+
</plugin>
86+
87+
88+
89+
<plugin>
90+
<groupId>org.apache.maven.plugins</groupId>
91+
<artifactId>maven-shade-plugin</artifactId>
92+
<version>3.0.0</version>
93+
94+
95+
<executions>
96+
<execution>
97+
<phase>package</phase>
98+
<goals>
99+
<goal>shade</goal>
100+
</goals>
101+
<configuration>
102+
<finalName>java-basic-blockchain</finalName>
103+
<transformers>
104+
<transformer
105+
implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
106+
<mainClass>com.github.chen0040.embeddings.GloVeModelDemo</mainClass>
107+
</transformer>
108+
</transformers>
109+
</configuration>
110+
</execution>
111+
</executions>
112+
</plugin>
113+
</plugins>
114+
</build>
12115

13116
</project>
Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
package com.github.chen0040.embeddings;
2+
3+
import com.github.chen0040.embeddings.utils.HttpClient;
4+
import net.lingala.zip4j.core.ZipFile;
5+
import net.lingala.zip4j.exception.ZipException;
6+
import org.slf4j.Logger;
7+
import org.slf4j.LoggerFactory;
8+
9+
import java.io.*;
10+
import java.util.Arrays;
11+
import java.util.HashMap;
12+
import java.util.List;
13+
import java.util.Map;
14+
15+
public class GloVeModel {
16+
17+
private static final String url = "http://nlp.stanford.edu/data/glove.6B.zip";
18+
private Map<String, float[]> word2em = new HashMap<>();
19+
private static final Logger logger = LoggerFactory.getLogger(GloVeModel.class);
20+
private int dimension = -1;
21+
22+
public static List<Integer> getAvailableDimensionList() {
23+
return Arrays.asList(50, 100, 200, 300);
24+
}
25+
26+
private static final String getGloVeTextFileName(int dimension){
27+
return "glove.6B." + dimension + "d.txt";
28+
}
29+
30+
public Map<String, float[]> load100() {
31+
return load(100);
32+
}
33+
34+
public Map<String, float[]> load50() {
35+
return load(50);
36+
}
37+
38+
public Map<String, float[]> load200() {
39+
return load(200);
40+
}
41+
42+
public Map<String, float[]> load300() {
43+
return load(300);
44+
}
45+
46+
public Map<String, float[]> load(int dimension) {
47+
return load("/tmp", dimension);
48+
}
49+
50+
public float[] encodeWord(String word) {
51+
word = word.toLowerCase();
52+
if(word2em.containsKey(word)) {
53+
return word2em.get(word);
54+
}
55+
return null;
56+
}
57+
58+
public int size() {
59+
return word2em.size();
60+
}
61+
62+
public Map<String, float[]> load(String dirPath, int dimension){
63+
this.dimension = -1;
64+
word2em.clear();
65+
String sourceFile100 = getGloVeTextFileName(dimension);
66+
String filePath = dirPath + "/" + sourceFile100;
67+
File file = new File(filePath);
68+
if(!file.exists()){
69+
String zipFilePath = dirPath + "/glove.6B.zip";
70+
if(!new File(zipFilePath).exists()) {
71+
if (!HttpClient.downloadFile(url, zipFilePath)) {
72+
return word2em;
73+
}
74+
}
75+
76+
if(!unZip(zipFilePath, dirPath)){
77+
return word2em;
78+
}
79+
}
80+
81+
try(BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(new File(filePath))))){
82+
String line;
83+
while((line=reader.readLine()) != null) {
84+
String[] parts = line.split(" ");
85+
String word = parts[0];
86+
float[] vec = new float[dimension];
87+
for(int i=1; i <= dimension; ++i) {
88+
vec[i-1] = Float.parseFloat(parts[i]);
89+
}
90+
word2em.put(word, vec);
91+
92+
}
93+
} catch (IOException e) {
94+
logger.error("Failed to read file " + filePath, e);
95+
word2em.clear();
96+
return new HashMap<>();
97+
}
98+
99+
this.dimension = dimension;
100+
101+
return word2em;
102+
103+
}
104+
105+
private boolean unZip(String zipFilePath, String dirPath) {
106+
try {
107+
ZipFile zipFile = new ZipFile(zipFilePath);
108+
zipFile.extractAll(dirPath);
109+
return true;
110+
}
111+
catch (ZipException e) {
112+
logger.error("Failed to unzip " + zipFilePath, e);
113+
return false;
114+
}
115+
}
116+
}
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
package com.github.chen0040.embeddings;
2+
3+
public class GloVeModelDemo {
4+
public static void main(String[] args) {
5+
GloVeModel model = new GloVeModel();
6+
model.load100();
7+
System.out.println("word2em size: " + model.size());
8+
}
9+
10+
}

0 commit comments

Comments
(0)

AltStyle によって変換されたページ (->オリジナル) /