Java实现哈希（相似度）算法，用于试题相似度，字符串相似度等场景-白红宇

Java实现哈希（相似度）算法，用于试题相似度，字符串相似度等场景

阅读量：117 次

发布时间：2019-02-27

本文共 4488 字，大约阅读时间需要 14 分钟。

??????????????????????????????????????????????????????????????????????????????????????????????????????????

???????????

??????????Java?HashMap?????????????????????????????????

????????????????????

???????????????????????????51???????100101?101011?

?????????????????2????????????

?????????????????????????

??????????????????????????????simhash???

??????????????????????????????????????????

???????????

???????????????????????

???????HTML???????????????????

???????????????????????????

???????????????????????????????????????

?????????????????????????????

???????

????????????????????

???????AND?XOR?????????????

?????????????????????

???????

public class SimHashAlgorithm {    private String tokens;    private BigInteger strSimHash;    private int hashbits = 64;    public SimHashAlgorithm(String tokens) {        this.tokens = tokens;        this.strSimHash = computeSimHash(tokens);    }    private BigInteger computeSimHash(String tokens) {        tokens = cleanResume(tokens);        int[] v = new int[hashbits];        List
   
     termList = StandardTokenizer.segment(tokens);        Map
    
      wordCount = new HashMap<>();        Map
     
       stopNatures = new HashMap<>();        stopNatures.put("w", "");        wordCount.put("n", 2);        for (Term term : termList) {            String word = term.word;            String nature = term.nature.toString();            if (stopNatures.containsKey(nature)) continue;            if (wordCount.containsKey(word)) {                int count = wordCount.get(word);                if (count > 5) continue;                wordCount.put(word, count + 1);            } else {                wordCount.put(word, 1);            }            BigInteger t = hash(word);            for (int i = 0; i < hashbits; i++) {                BigInteger bitmask = BigInteger.ONE.shiftLeft(i);                if (t.and(bitmask).signum() != 0) {                    v[i] += wordCount.get(word) * weightOfNature.getOrDefault(nature, 1);                } else {                    v[i] -= wordCount.get(word) * weightOfNature.getOrDefault(nature, 1);                }            }        }        BigInteger fingerprint = BigInteger.ZERO;        for (int i = 0; i < hashbits; i++) {            if (v[i] > 0) {                fingerprint = fingerprint.add(BigInteger.ONE.shiftLeft(i));            }        }        return fingerprint;    }    private BigInteger hash(String source) {        if (source == null || source.isEmpty()) return BigInteger.ZERO;        StringBuilder sb = new StringBuilder();        while (sb.length() < 64) {            sb.append(source.charAt(0));            source = source.substring(1);        }        source = sb.toString();        char[] chars = source.toCharArray();        BigInteger x = BigInteger.valueOf((long) chars[0] << 7);        BigInteger m = new BigInteger("1000003");        BigInteger mask = m.pow(hashbits).subtract(BigInteger.ONE);        for (char c : chars) {            BigInteger temp = BigInteger.valueOf((long) c);            x = x.multiply(m).xor(temp).and(mask);        }        x = x.xor(BigInteger.valueOf(source.length()));        return x.equals(BigInteger.valueOf(-1)) ? BigInteger.valueOf(-2) : x;    }    public int hammingDistance(SimHashAlgorithm other) {        BigInteger m = BigInteger.ONE.shiftLeft(hashbits).subtract(BigInteger.ONE);        BigInteger x = strSimHash.xor(other.strSimHash).and(m);        int distance = 0;        while (x.signum() != 0) {            distance++;            x = x.and(x.subtract(BigInteger.ONE));        }        return distance;    }    public double getSemblance(SimHashAlgorithm s2) {        int distance = hammingDistance(s2);        return 1 - (distance / hashbits * 100);    }    public static String getPercentValue(double similarity) {        NumberFormat fmt = NumberFormat.getPercentInstance();        fmt.setMaximumFractionDigits(2);        return fmt.format(similarity);    }    public static void main(String[] args) {        String[] str1 = {"?????", "1234567890"};        String[] str2 = {"??????", "1234567890"};        SimHashAlgorithm hash1 = new SimHashAlgorithm(str1[0] + str1[1]);        SimHashAlgorithm hash2 = new SimHashAlgorithm(str2[0] + str2[1]);        double similarity = hash1.getSemblance(hash2);        System.out.println("?????" + getPercentValue(similarity) + "%");    }}