// [Java] Hive UDF: Chinese string similarity

package udf;

import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;
import org.testng.annotations.Test;

import java.util.Formatter;
import java.util.Locale;

/**
 * Hive UDF that scores how similar two strings are, using the minimum edit
 * (Levenshtein) distance between them.
 *
 * <p>Similarity formula: {@code 1 - minEditDistance / lengthOfLongerString}.
 * Lengths and character comparisons are done on Unicode code points rather than
 * UTF-16 {@code char}s, so a character encoded as a surrogate pair (for example
 * a CJK Extension B ideograph) counts as one character.
 */
public class Similar extends UDF {

    /**
     * UDF entry point: returns the similarity of the two values formatted with
     * two decimal places.
     *
     * @param astr first value; may be {@code null}
     * @param bstr second value; may be {@code null}
     * @return the similarity as text such as {@code "0.20"}, or {@code null} when
     *         either argument is {@code null}
     */
    public Text evaluate(Text astr, Text bstr) {
        // A null argument has no defined similarity; return null instead of
        // failing with a NullPointerException.
        if (astr == null || bstr == null) {
            return null;
        }
        return new Text(formatRate(xiangsidu(astr.toString(), bstr.toString())));
    }

    /** Prints the formatted similarity of two sample strings; makes no assertions. */
    @Test
    public void test() {
        String rate = formatRate(xiangsidu("全世界无产者联合起来", "无产阶级"));

        System.out.println(rate);
    }

    /**
     * Computes the similarity of two strings as
     * {@code 1 - minEditDistance / lengthOfLongerString}.
     *
     * @param a first string, must not be {@code null}
     * @param b second string, must not be {@code null}
     * @return a value in {@code [0, 1]}; {@code 1} for equal non-empty strings and
     *         {@code 0} when both strings are empty (the ratio is undefined there
     *         because the longer length is zero)
     * @throws NullPointerException if either argument is {@code null}
     */
    public double xiangsidu(String a, String b) {
        int[] left = toCodePoints(a);
        int[] right = toCodePoints(b);
        int longest = Math.max(left.length, right.length);
        if (longest == 0) {
            return 0;
        }
        // Cast first so the division is done in floating point.
        return 1 - (double) editDistance(left, right) / longest;
    }

    /**
     * Computes the minimum edit distance between two strings: the smallest number
     * of single-character insertions, deletions and substitutions that turns
     * {@code a} into {@code b}.
     *
     * @param a first string, must not be {@code null}
     * @param b second string, must not be {@code null}
     * @return the edit distance, a non-negative whole number returned as a double
     * @throws NullPointerException if either argument is {@code null}
     */
    public double minEditDist(String a, String b) {
        return editDistance(toCodePoints(a), toCodePoints(b));
    }

    /** Edit distance over code-point arrays, keeping only two rows of the DP table. */
    private static int editDistance(int[] a, int[] b) {
        // If one side is empty, the distance is the length of the other side.
        if (a.length == 0 || b.length == 0) {
            return a.length + b.length;
        }

        // previous[j] = distance between the first i-1 symbols of a and the first j of b.
        int[] previous = new int[b.length + 1];
        int[] current = new int[b.length + 1];
        for (int j = 0; j <= b.length; j++) {
            previous[j] = j;
        }

        for (int i = 1; i <= a.length; i++) {
            current[0] = i;
            for (int j = 1; j <= b.length; j++) {
                int cost = (a[i - 1] == b[j - 1]) ? 0 : 1;
                int deletion = previous[j] + 1;
                int insertion = current[j - 1] + 1;
                int substitution = previous[j - 1] + cost;
                current[j] = Math.min(Math.min(deletion, insertion), substitution);
            }
            // The row just filled becomes the "previous" row of the next iteration.
            int[] swap = previous;
            previous = current;
            current = swap;
        }
        return previous[b.length];
    }

    /** Splits a string into its Unicode code points (a surrogate pair becomes one entry). */
    private static int[] toCodePoints(String s) {
        int[] codePoints = new int[s.codePointCount(0, s.length())];
        int offset = 0;
        for (int k = 0; k < codePoints.length; k++) {
            int codePoint = s.codePointAt(offset);
            codePoints[k] = codePoint;
            offset += Character.charCount(codePoint);
        }
        return codePoints;
    }

    /** Formats a similarity with two decimals, always using '.' as the decimal separator. */
    private static String formatRate(double rate) {
        // Locale.ROOT keeps the output independent of the JVM's default locale.
        return String.format(Locale.ROOT, "%.2f", rate);
    }

}