Java实现哈希（相似度）算法，用于试题相似度，字符串相似度等场景-白红宇

Java实现哈希（相似度）算法，用于试题相似度，字符串相似度等场景

阅读量：117 次

发布时间：2019-02-27

本文共 4768 字，大约阅读时间需要 15 分钟。

一、哈希（相似度）算法原理

借鉴hashmap算法找出可以hash的key值：因为我们使用的simhash是局部敏感哈希，这个算法的特点是相似的字符串算出的哈希值只有个别位有差别。这样我们可以推断，两个相似的文本至少有16位的simhash是一样的。具体选择16位、8位还是4位，大家根据自己的数据测试选择。算法分为以下五步：1、分词，把需要判断的文本分词，形成这篇文章的特征单词。2、hash，通过hash算法把每个词变成hash值，比如“美国”通过hash算法计算为 100101，“51区”通过hash算法计算为 101011。3、加权，对第2步生成的hash值，按照单词的权重形成加权数字串（逢1取正权重，逢0取负权重）；比如假设“美国”的权重为4，则得到 “4 -4 -4 4 -4 4”，假设“51区”的权重为5，则得到 “5 -5 5 -5 5 5”。4、合并，把上面各个单词算出来的序列值按位累加，变成只有一个序列串；比如 “4 -4 -4 4 -4 4” 与 “5 -5 5 -5 5 5” 累加得到 “9 -9 1 -1 1 9”。5、降维，把第4步算出来的 “9 -9 1 -1 1 9” 变成 0 1 串（正数记为1，负数记为0），即 “1 0 1 0 1 1”，形成我们最终的simhash签名。

二、哈希（相似度）算法实现

public class SimHashAlgorithm {    private String tokens; //字符串    private BigInteger strSimHash;//字符产的hash值    private int hashbits = 64; // 分词后的hash数;    public SimHashAlgorithm(String tokens) {        this.tokens = tokens;        this.strSimHash = this.simHash();    }    private SimHashAlgorithm(String tokens, int hashbits) {        this.tokens = tokens;        this.hashbits = hashbits;        this.strSimHash = this.simHash();    }    /**     * 清除html标签     * @param content     * @return     */    private String cleanResume(String content) {        // 若输入为HTML,下面会过滤掉所有的HTML的tag        content = Jsoup.clean(content, Whitelist.none());        content = StringUtils.lowerCase(content);        String[] strings = {" ", "\n", "\r", "\t", "\\r", "\\n", "\\t", " "};        for (String s : strings) {            content = content.replaceAll(s, "");        }        return content;    }    /**     * 这个是对整个字符串进行hash计算     * @return     */    private BigInteger simHash() {        tokens = cleanResume(tokens); // cleanResume 删除一些特殊字符        int[] v = new int[this.hashbits];        List
    
      termList = StandardTokenizer.segment(this.tokens); // 对字符串进行分词        //对分词的一些特殊处理 : 比如: 根据词性添加权重 , 过滤掉标点符号 , 过滤超频词汇等;        Map
     
       weightOfNature = new HashMap
      
       (); // 词性的权重        weightOfNature.put("n", 2); //给名词的权重是2;        Map
       
         stopNatures = new HashMap
        
         ();//停用的词性 如一些标点符号之类的;        stopNatures.put("w", ""); //        int overCount = 5; //设定超频词汇的界限 ;        Map
         
           wordCount = new HashMap
          
           (); for (Term term : termList) { String word = term.word; //分词字符串 String nature = term.nature.toString(); // 分词属性; // 过滤超频词 if (wordCount.containsKey(word)) { int count = wordCount.get(word); if (count > overCount) { continue; } wordCount.put(word, count + 1); } else { wordCount.put(word, 1); } // 过滤停用词性 if (stopNatures.containsKey(nature)) { continue; } // 2、将每一个分词hash为一组固定长度的数列.比如 64bit 的一个整数. BigInteger t = this.hash(word); for (int i = 0; i < this.hashbits; i++) { BigInteger bitmask = new BigInteger("1").shiftLeft(i); // 3、建立一个长度为64的整数数组(假设要生成64位的数字指纹,也可以是其它数字), // 对每一个分词hash后的数列进行判断,如果是1000...1,那么数组的第一位和末尾一位加1, // 中间的62位减一,也就是说,逢1加1,逢0减1.一直到把所有的分词hash数列全部判断完毕. int weight = 1; //添加权重 if (weightOfNature.containsKey(nature)) { weight = weightOfNature.get(nature); } if (t.and(bitmask).signum() != 0) { // 这里是计算整个文档的所有特征的向量和 v[i] += weight; } else { v[i] -= weight; } } } BigInteger fingerprint = new BigInteger("0"); for (int i = 0; i < this.hashbits; i++) { if (v[i] >= 0) { fingerprint = fingerprint.add(new BigInteger("1").shiftLeft(i)); } } return fingerprint; } /** * 对单个的分词进行hash计算; * @param source * @return */ private BigInteger hash(String source) { if (source == null || source.length() == 0) { return new BigInteger("0"); } else { /** * 当sourece 的长度过短，会导致hash算法失效，因此需要对过短的词补偿 */ while (source.length() < 3) { source = source + source.charAt(0); } char[] sourceArray = source.toCharArray(); BigInteger x = BigInteger.valueOf(((long) sourceArray[0]) << 7); BigInteger m = new BigInteger("1000003"); BigInteger mask = new BigInteger("2").pow(this.hashbits).subtract(new BigInteger("1")); for (char item : sourceArray) { BigInteger temp = BigInteger.valueOf((long) item); x = x.multiply(m).xor(temp).and(mask); } x = x.xor(new BigInteger(String.valueOf(source.length()))); if (x.equals(new BigInteger("-1"))) { x = new BigInteger("-2"); } return x; } } /** * 计算海明距离,海明距离越小说明越相似; * @param other * @return */ private int hammingDistance(SimHashAlgorithm other) { 
BigInteger m = new BigInteger("1").shiftLeft(this.hashbits).subtract( new BigInteger("1")); BigInteger x = this.strSimHash.xor(other.strSimHash).and(m); int tot = 0; while (x.signum() != 0) { tot += 1; x = x.and(x.subtract(new BigInteger("1"))); } return tot; } public double getSemblance(SimHashAlgorithm s2 ){ double i = (double) this.hammingDistance(s2); return 1 - i/this.hashbits ; } public static String getPercentValue( double similarity){ NumberFormat fmt = NumberFormat.getPercentInstance(); fmt.setMaximumFractionDigits(2);//最多两位百分小数，如25.23% return fmt.format(similarity); } public static void main(String[] args) { //要比较的两个字符串 String[] str1 = {"今天星期四","1234567890"}; String[] str2 = {"今天是星期五","1234567890"}; for(int i=0;i