package com.javaweb.cms.config;
|
|
import java.math.BigInteger;
|
import java.util.ArrayList;
|
import java.util.HashMap;
|
import java.util.List;
|
import java.util.StringTokenizer;
|
|
public class SimHash {
|
private String tokens;
|
|
private BigInteger intSimHash;
|
|
private String strSimHash;
|
|
private int hashbits = 64;
|
|
public SimHash(String tokens) {
|
this.tokens = tokens;
|
this.intSimHash = this.simHash();
|
}
|
|
public SimHash(String tokens, int hashbits) {
|
this.tokens = tokens;
|
this.hashbits = hashbits;
|
this.intSimHash = this.simHash();
|
}
|
|
HashMap<String, Integer> wordMap = new HashMap<String, Integer>();
|
|
public BigInteger simHash() {
|
// 定义特征向量/数组
|
int[] v = new int[this.hashbits];
|
// 1、将文本去掉格式后, 分词.
|
StringTokenizer stringTokens = new StringTokenizer(this.tokens);
|
while (stringTokens.hasMoreTokens()) {
|
String temp = stringTokens.nextToken();
|
// 2、将每一个分词hash为一组固定长度的数列.比如 64bit 的一个整数.
|
BigInteger t = this.hash(temp);
|
for (int i = 0; i < this.hashbits; i++) {
|
BigInteger bitmask = new BigInteger("1").shiftLeft(i);
|
// 3、建立一个长度为64的整数数组(假设要生成64位的数字指纹,也可以是其它数字),
|
// 对每一个分词hash后的数列进行判断,如果是1000...1,那么数组的第一位和末尾一位加1,
|
// 中间的62位减一,也就是说,逢1加1,逢0减1.一直到把所有的分词hash数列全部判断完毕.
|
if (t.and(bitmask).signum() != 0) {
|
// 这里是计算整个文档的所有特征的向量和
|
// 这里实际使用中需要 +- 权重,而不是简单的 +1/-1,
|
v[i] += 1;
|
} else {
|
v[i] -= 1;
|
}
|
}
|
}
|
BigInteger fingerprint = new BigInteger("0");
|
StringBuffer simHashBuffer = new StringBuffer();
|
for (int i = 0; i < this.hashbits; i++) {
|
// 4、最后对数组进行判断,大于0的记为1,小于等于0的记为0,得到一个 64bit 的数字指纹/签名.
|
if (v[i] >= 0) {
|
fingerprint = fingerprint.add(new BigInteger("1").shiftLeft(i));
|
simHashBuffer.append("1");
|
} else {
|
simHashBuffer.append("0");
|
}
|
}
|
this.strSimHash = simHashBuffer.toString();
|
System.out.println(this.strSimHash + " length " + this.strSimHash.length());
|
return fingerprint;
|
}
|
|
private BigInteger hash(String source) {
|
if (source == null || source.length() == 0) {
|
return new BigInteger("0");
|
} else {
|
char[] sourceArray = source.toCharArray();
|
BigInteger x = BigInteger.valueOf(((long) sourceArray[0]) << 7);
|
BigInteger m = new BigInteger("1000003");
|
BigInteger mask = new BigInteger("2").pow(this.hashbits).subtract(new BigInteger("1"));
|
for (char item : sourceArray) {
|
BigInteger temp = BigInteger.valueOf((long) item);
|
x = x.multiply(m).xor(temp).and(mask);
|
}
|
x = x.xor(new BigInteger(String.valueOf(source.length())));
|
if (x.equals(new BigInteger("-1"))) {
|
x = new BigInteger("-2");
|
}
|
return x;
|
}
|
}
|
|
public int hammingDistance(SimHash other) {
|
|
BigInteger x = this.intSimHash.xor(other.intSimHash);
|
int tot = 0;
|
|
// 统计x中二进制位数为1的个数
|
// 我们想想,一个二进制数减去1,那么,从最后那个1(包括那个1)后面的数字全都反了,对吧,然后,n&(n-1)就相当于把后面的数字清0,
|
// 我们看n能做多少次这样的操作就OK了。
|
|
while (x.signum() != 0) {
|
tot += 1;
|
x = x.and(x.subtract(new BigInteger("1")));
|
}
|
return tot;
|
}
|
|
public int getDistance(String str1, String str2) {
|
int distance;
|
if (str1.length() != str2.length()) {
|
distance = -1;
|
} else {
|
distance = 0;
|
for (int i = 0; i < str1.length(); i++) {
|
if (str1.charAt(i) != str2.charAt(i)) {
|
distance++;
|
}
|
}
|
}
|
return distance;
|
}
|
|
public List subByDistance(SimHash simHash, int distance) {
|
// 分成几组来检查
|
int numEach = this.hashbits / (distance + 1);
|
List characters = new ArrayList();
|
|
StringBuffer buffer = new StringBuffer();
|
|
int k = 0;
|
for (int i = 0; i < this.intSimHash.bitLength(); i++) {
|
// 当且仅当设置了指定的位时,返回 true
|
boolean sr = simHash.intSimHash.testBit(i);
|
|
if (sr) {
|
buffer.append("1");
|
} else {
|
buffer.append("0");
|
}
|
|
if ((i + 1) % numEach == 0) {
|
// 将二进制转为BigInteger
|
BigInteger eachValue = new BigInteger(buffer.toString(), 2);
|
System.out.println("----" + eachValue);
|
buffer.delete(0, buffer.length());
|
characters.add(eachValue);
|
}
|
}
|
|
return characters;
|
}
|
|
public static void main(String[] args) {
|
String s = "This is a test string for testing";
|
SimHash hash1 = new SimHash(s, 64);
|
System.out.println(hash1.intSimHash + " " + hash1.intSimHash.bitLength());
|
hash1.subByDistance(hash1, 3);
|
|
s = "This is a test string for testing, This is a test string for testing abcdef";
|
SimHash hash2 = new SimHash(s, 64);
|
System.out.println(hash2.intSimHash + " " + hash2.intSimHash.bitCount());
|
hash1.subByDistance(hash2, 3);
|
|
s = "This is a test string for testing als";
|
SimHash hash3 = new SimHash(s, 64);
|
System.out.println(hash3.intSimHash + " " + hash3.intSimHash.bitCount());
|
hash1.subByDistance(hash3, 4);
|
|
System.out.println("============================");
|
|
int dis = hash1.getDistance(hash1.strSimHash, hash2.strSimHash);
|
System.out.println(hash1.hammingDistance(hash2) + " " + dis);
|
|
int dis2 = hash1.getDistance(hash1.strSimHash, hash3.strSimHash);
|
System.out.println(hash1.hammingDistance(hash3) + " " + dis2);
|
|
//通过Unicode编码来判断中文
|
/*String str = "中国chinese";
|
for (int i = 0; i < str.length(); i++) {
|
System.out.println(str.substring(i, i + 1).matches("[\\u4e00-\\u9fbb]+"));
|
}*/
|
|
}
|
|
}
|