C＃查找搜索结果显示的相关文档片段

在为我正在构建的网站开发搜索功能时，我决定采用廉价而快捷的方式，使用 Microsoft SQL Server 的全文搜索引擎，而不是 Lucene.Net 之类更强大的方案。不过，我希望拥有的功能之一是类似 Google 的相关文档片段。我很快发现，确定“相关”片段比我预想的要难。我想根据搜索词在所找到文本中的密度来选择片段。所以，基本上，我需要在文本中找到搜索词最密集的那一段。这里的“段”是指某个任意长度的字符数（比如 200——具体数值其实无关紧要）。我的第一个想法是在循环中使用 .IndexOf()，构建一个搜索词间距数组（用当前找到的搜索词的索引减去前一个找到的搜索词的索引），然后……怎么办呢？把任意两个、任意三个、任意四个、任意五个连续的数组元素相加，并取和最小的那一组（也就是搜索词之间距离最小的那一组）。这看起来很混乱。有没有一种比我想到的办法更成熟、更好或更直观的做法？

已邀请:

7 个回复

拟蓬

尽管它是用Java实现的，但您可以在此处看到针对该问题的一种方法： http://rcrezende.blogspot.com/2010/08/smallest-relevant-text-snippet-for.html

告耸

我知道这个帖子已经很旧了，但上周我自己试着实现了一下，过程相当痛苦。这远非完美，但这是我想出的方案。代码段生成器：

/// <summary>
/// Builds an excerpt of <paramref name="StringToSnip"/> out of windows centred on keyword hits.
/// </summary>
/// <param name="StringToSnip">Text to excerpt. Null or empty yields an empty result.</param>
/// <param name="Keywords">Keywords to look for (case-insensitive, current culture). Null or empty entries are ignored.</param>
/// <param name="SnippetLength">Width, in characters, of the window cut around each (merged) hit.</param>
/// <returns>
/// The windows joined with " ... ", with a leading "... " when the excerpt does not start at the
/// beginning of the text and a trailing " ... " when it does not reach the end. Empty when nothing matches.
/// </returns>
public static string SelectKeywordSnippets(string StringToSnip, string[] Keywords, int SnippetLength)
{
    // Once the excerpt grows past this many characters no further windows are appended.
    const int MaxTotalLength = 200;

    if (string.IsNullOrEmpty(StringToSnip) || Keywords == null)
    {
        return "";
    }

    int halfWindow = SnippetLength / 2;
    List<int> keywordLocations = new List<int>();

    // Get the locations of all keywords.
    foreach (string keyword in Keywords)
    {
        // An empty keyword matches everywhere and carries no information, so it is skipped.
        if (string.IsNullOrEmpty(keyword))
        {
            continue;
        }

        keywordLocations.AddRange(SharedTools.IndexOfAll(StringToSnip, keyword, StringComparison.CurrentCultureIgnoreCase));
    }

    keywordLocations.Sort();

    // Merge hits that are closer together than half a window into their midpoint.
    // Repeated until a full pass makes no change, because a merge can bring the
    // new midpoint close to yet another neighbour.
    bool merged = keywordLocations.Count > 1;
    while (merged)
    {
        merged = false;
        for (int i = keywordLocations.Count - 1; i > 0; i--)
        {
            if (keywordLocations[i] - keywordLocations[i - 1] < halfWindow)
            {
                keywordLocations[i - 1] = (keywordLocations[i] + keywordLocations[i - 1]) / 2;
                keywordLocations.RemoveAt(i);
                merged = true;
            }
        }
    }

    // Make the snippets.
    string snippedString = "";
    int previousEnd = 0; // end (exclusive) of the text already emitted
    foreach (int location in keywordLocations)
    {
        // Hits between half a window and a full window apart produce overlapping windows.
        // Starting at previousEnd keeps the shared text from being emitted twice.
        int stringStart = Math.Max(Math.Max(0, location - halfWindow), previousEnd);
        int stringEnd = Math.Min(location + halfWindow, StringToSnip.Length);
        if (stringEnd <= stringStart)
        {
            continue;
        }

        // Only mark an ellipsis where text was actually skipped.
        if (stringStart > previousEnd)
        {
            snippedString += snippedString.Length == 0 ? "... " : " ... ";
        }

        snippedString += StringToSnip.Substring(stringStart, stringEnd - stringStart);
        previousEnd = stringEnd;

        if (snippedString.Length > MaxTotalLength)
        {
            break;
        }
    }

    if (snippedString.Length > 0 && previousEnd < StringToSnip.Length)
    {
        snippedString += " ... ";
    }

    return snippedString;
}

该函数将查找示例文本中所有关键字的索引

 /// <summary>
 /// Finds the start index of every non-overlapping occurrence of <paramref name="needle"/>
 /// in <paramref name="haystack"/>, scanning left to right.
 /// </summary>
 /// <param name="haystack">Text to search. Null or empty yields an empty list.</param>
 /// <param name="needle">Text to look for. Null or empty yields an empty list.</param>
 /// <param name="Comparison">Comparison rule passed to <see cref="string.IndexOf(string, int, StringComparison)"/>.</param>
 /// <returns>Match positions in ascending order; never null.</returns>
 private static List<int> IndexOfAll(string haystack, string needle, StringComparison Comparison)
    {
        List<int> positions = new List<int>();

        // IndexOf with an empty needle returns the start index itself, so the
        // scan would never advance; treat it (and null input) as "no matches".
        if (string.IsNullOrEmpty(haystack) || string.IsNullOrEmpty(needle))
        {
            return positions;
        }

        int length = needle.Length;
        int offset = 0;
        int pos;

        // The offset check keeps IndexOf from being called with a start index beyond
        // the end of the text, which can happen if a culture-sensitive match covers
        // fewer characters than needle.Length.
        while (offset < haystack.Length
               && (pos = haystack.IndexOf(needle, offset, Comparison)) != -1)
        {
            positions.Add(pos);
            offset = pos + length;
        }

        return positions;
    }

它的实现有点笨拙。它的工作方式是先找到字符串中所有关键字的位置，然后确保任意两个关键字位置之间的距离都不小于所需的片段长度，这样片段就不会重叠（这也正是它不太完善的地方……）。接着截取以各关键字位置为中心、具有所需长度的子串，再把它们拼接在一起。我知道这个回答晚了好几年，但还是发出来，以防它能帮到遇到这个问题的人。

亲奋漏

这是一个很好的问题 :) 我想我会创建一个索引向量：对于每个单词，如果它是搜索词就记为 1，否则记为 0。然后找到使 sum(indexvector[i:i+maxlength]) 最大的 i。这实际上可以相当高效地完成。先统计前 maxlength 个单词中的搜索词数量；然后在向前移动时，如果 indexvector[i] = 1（即增大 i 时即将丢掉一个搜索词），就把计数器减 1；如果 indexvector[i+maxlength+1] = 1，就把计数器加 1。在移动过程中，记录计数器值最高的 i。一旦得到最佳的 i，你仍然可以做一些微调，看看能否在不影响计数器的前提下缩短实际长度，例如对齐到句子边界等；或者在计数器值相同的多个 i 中挑选合适的一个。不确定这是否比你的方法更好——它只是另一种方法。您可能还想看看关于该主题的这篇论文，其中给出了另一个基线：http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.72.4357&rep=rep1&type=pdf

畦桨存灯

/// <summary>
/// Picks the sentences of a text that are densest in query words and returns them,
/// in document order, with runs of matching words wrapped in
/// <c>[[HIGHLIGHT]]</c> / <c>[[ENDHIGHLIGHT]]</c> markers.
/// </summary>
public class Highlighter
{
    /// <summary>A sentence of the source text together with its ranking data.</summary>
    private class Packet
    {
        /// <summary>Raw sentence text as produced by splitting on '.'.</summary>
        public string Sentence;

        /// <summary>Fraction of the sentence's space-separated tokens that are query words.</summary>
        public double Density;

        /// <summary>Zero-based position of the sentence in the source text.</summary>
        public int Offset;
    }

    /// <summary>
    /// Builds a highlighted snippet of <paramref name="text"/> for <paramref name="query"/>.
    /// </summary>
    /// <param name="text">Source text; sentences are separated by '.'.</param>
    /// <param name="query">Space-separated query words, matched case-insensitively against whole tokens.</param>
    /// <param name="maxLength">Character budget for the selected sentence text (markers are not counted).</param>
    /// <returns>
    /// The selected sentences joined by ".", with "..." between sentences that were not
    /// adjacent in the source. Empty when no sentence contains a query word.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="text"/> or <paramref name="query"/> is null.</exception>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="maxLength"/> is negative.</exception>
    public static string FindSnippet(string text, string query, int maxLength)
    {
        if (text == null)
        {
            throw new ArgumentNullException("text");
        }
        if (query == null)
        {
            throw new ArgumentNullException("query");
        }
        if (maxLength < 0)
        {
            // ArgumentOutOfRangeException derives from ArgumentException, so callers
            // catching the original exception type are unaffected.
            throw new ArgumentOutOfRangeException("maxLength", maxLength, "maxLength must not be negative.");
        }

        var words = query.Split(' ')
            .Where(w => !string.IsNullOrWhiteSpace(w))
            .Select(w => w.ToLowerInvariant())
            .ToLookup(w => w);

        // The indexed Select overload supplies the sentence position directly, so no
        // captured counter is mutated inside the query. OrderByDescending is a stable
        // sort: sentences of equal density keep their document order.
        var packets = text.Split('.')
            .Select((sentence, index) => new Packet
            {
                Sentence = sentence,
                Density = ComputeDensity(words, sentence),
                Offset = index
            })
            .OrderByDescending(packet => packet.Density)
            .ToList();

        // Keyed by sentence position so the output comes back in document order.
        var selected = new SortedList<int, string>();
        int length = 0;
        foreach (var packet in packets)
        {
            if (length >= maxLength || packet.Density == 0)
            {
                break;
            }
            string sentence = packet.Sentence;
            // The last sentence taken is truncated to whatever budget is left.
            selected.Add(packet.Offset, sentence.Substring(0, Math.Min(sentence.Length, maxLength - length)));
            length += sentence.Length;
        }

        var parts = new List<string>();
        int previous = -1;
        foreach (var item in selected)
        {
            var offset = item.Key;
            if (previous != -1 && offset - previous != 1)
            {
                // An extra "." part turns the "." separator into "..." after joining,
                // marking that sentences were skipped.
                parts.Add(".");
            }
            previous = offset;
            parts.Add(Highlight(item.Value, words));
        }
        return String.Join(".", parts);
    }

    /// <summary>
    /// Wraps each run of consecutive query words in <paramref name="sentence"/> with highlight markers.
    /// </summary>
    /// <param name="sentence">Sentence text; tokens are separated by single spaces.</param>
    /// <param name="words">Lower-cased query words.</param>
    /// <returns>The tokens and markers joined by single spaces.</returns>
    private static string Highlight(string sentence, ILookup<string, string> words)
    {
        var parts = new List<string>();
        var outsideHighlight = true;
        foreach (var word in sentence.Split(' '))
        {
            var token = word.ToLowerInvariant();
            var isMatch = words.Contains(token);
            if (outsideHighlight && isMatch)
            {
                parts.Add("[[HIGHLIGHT]]");
                outsideHighlight = false;
            }
            // Empty tokens (from repeated spaces) do not close an open highlight.
            if (!outsideHighlight && !string.IsNullOrWhiteSpace(token) && !isMatch)
            {
                parts.Add("[[ENDHIGHLIGHT]]");
                outsideHighlight = true;
            }
            parts.Add(word);
        }
        if (!outsideHighlight)
        {
            parts.Add("[[ENDHIGHLIGHT]]");
        }
        return String.Join(" ", parts);
    }

    /// <summary>
    /// Computes the share of space-separated tokens in <paramref name="sentence"/> that are query words.
    /// </summary>
    /// <param name="words">Lower-cased query words.</param>
    /// <param name="sentence">Sentence to score.</param>
    /// <returns>A value in [0, 1]; 0 for an empty sentence or an empty query.</returns>
    private static double ComputeDensity(ILookup<string, string> words, string sentence)
    {
        if (string.IsNullOrEmpty(sentence) || words.Count == 0)
        {
            return 0;
        }

        // Every token counts toward the denominator, including the empty ones that
        // leading or repeated spaces produce.
        var tokens = sentence.Split(' ');
        int matches = 0;
        foreach (var token in tokens)
        {
            if (words.Contains(token.ToLowerInvariant()))
            {
                matches++;
            }
        }

        // Split on a non-empty string always yields at least one token.
        return (double)matches / tokens.Length;
    }
}

例：用查询“光流”对文本“光流被定义为图像中结构光的变化，例如在视网膜或相机的传感器上，由眼球或相机与场景之间的相对运动引起。文献中的进一步定义强调了光流的不同特性”进行高亮。输出： [[HIGHLIGHT]] 光流 [[ENDHIGHLIGHT]] 被定义为图像中结构光的变化，例...文献中的进一步定义强调了不同 [[HIGHLIGHT]] 光流 [[ENDHIGHLIGHT]] 的特性

搁手

好吧，这是我按照上面描述的算法拼凑出来的一个版本。我觉得它并不算好：它用了三个（数数看，三个！）循环，外加两个列表。不过，有总比没有好。我还把最大长度硬编码在代码里，而没有把它做成参数。

/// <summary>
/// Returns the excerpt of <paramref name="infoText"/> (at most 250 characters) whose
/// window contains the largest number of search-term hits.
/// </summary>
/// <param name="infoText">Text to excerpt. Null or empty yields an empty string.</param>
/// <param name="searchTerms">
/// Terms to look for (case-sensitive, current culture). Null or empty entries are ignored.
/// </param>
/// <returns>
/// The whole text when it is no longer than 250 characters; the first 250 characters when
/// no term occurs; otherwise a 250-character window positioned around the densest group of hits.
/// </returns>
private static string FindRelevantSnippets(string infoText, string[] searchTerms)
{
    const int MaxSnippetLength = 250;

    if (string.IsNullOrEmpty(infoText))
    {
        return string.Empty;
    }

    // Collect the start index of every occurrence of every term.
    List<int> termLocations = new List<int>();
    if (searchTerms != null)
    {
        foreach (string term in searchTerms)
        {
            if (string.IsNullOrEmpty(term))
            {
                continue;
            }

            // IndexOf returns -1 for "not found", so index 0 is a valid hit.
            int termStart = infoText.IndexOf(term, StringComparison.CurrentCulture);
            while (termStart >= 0)
            {
                termLocations.Add(termStart);
                if (termStart + 1 >= infoText.Length)
                {
                    break;
                }
                termStart = infoText.IndexOf(term, termStart + 1, StringComparison.CurrentCulture);
            }
        }
    }

    if (termLocations.Count == 0)
    {
        return infoText.Length > MaxSnippetLength
            ? infoText.Substring(0, MaxSnippetLength)
            : infoText;
    }

    termLocations.Sort();

    // Two terms can start at the same index (e.g. "foo" and "foobar"); count each position once.
    List<int> hits = new List<int>(termLocations.Count);
    foreach (int location in termLocations)
    {
        if (hits.Count == 0 || hits[hits.Count - 1] != location)
        {
            hits.Add(location);
        }
    }

    // Slide over the sorted hits: for each possible first hit, extend to the last hit
    // that still starts within MaxSnippetLength characters of it, and keep the group
    // with the most hits (the earliest group wins ties).
    int bestFirst = 0;
    int bestLast = 0;
    int last = 0;
    for (int first = 0; first < hits.Count; first++)
    {
        if (last < first)
        {
            last = first;
        }
        while (last + 1 < hits.Count && hits[last + 1] - hits[first] < MaxSnippetLength)
        {
            last++;
        }
        if (last - first > bestLast - bestFirst)
        {
            bestFirst = first;
            bestLast = last;
        }
    }

    // Split the unused part of the window evenly before and after the group, then
    // clamp so the window stays inside the text and keeps its full length if possible.
    int span = hits[bestLast] - hits[bestFirst];
    int padding = (MaxSnippetLength - span) / 2;
    int start = Math.Max(hits[bestFirst] - padding, 0);
    start = Math.Min(start, Math.Max(infoText.Length - MaxSnippetLength, 0));
    int len = Math.Min(MaxSnippetLength, infoText.Length - start);
    return infoText.Substring(start, len);
}

我能想到的一个改进是返回多个较短的“片段”，让它们的长度加起来等于原来较长的长度——这样就可以对文档的多个部分进行采样。

臀博

如果你使用 CONTAINSTABLE，你会得到一个 RANK，它实际上就是一个密度值——RANK 值越高，密度越高。这样，您只需运行查询即可获得所需的结果，而不必在数据返回之后再对其进行加工处理。

赣借

我写了一个函数来做这件事。你需要传入以下输入：文档文本——您要从中摘录片段的文档全文，您很可能需要先去掉其中的 BBCode/HTML；原始查询——用户输入的搜索字符串；片段长度——您希望显示的片段长度。返回值：片段在文档文本中的起始索引。要获得该片段，只需执行 documentText.Substring(returnValue, snippetLength) 即可。这样做的好处是你能知道片段取自开头、中间还是结尾，因此可以按需在片段的开头/结尾添加 ... 之类的修饰。性能：将 resolution 设置为 1 可以找到最佳片段，但窗口每次只移动 1 个字符；把该值设得更高可以加快执行速度。调整：你可以随意调整 score 的计算方式。在这个例子中，我使用了 Math.Pow(wordLength, 2)，以偏向较长的单词。

/// <summary>
/// Finds where the best-scoring window of <paramref name="snippetLength"/> characters starts,
/// so that <c>documentText.Substring(result, snippetLength)</c> yields the snippet.
/// </summary>
/// <param name="documentText">Full text to excerpt. Null or whitespace-only yields 0.</param>
/// <param name="originalQuery">Space-separated query words, matched case-insensitively. Null is treated as empty.</param>
/// <param name="snippetLength">Window width in characters.</param>
/// <returns>
/// An index into <paramref name="documentText"/> as passed in (leading whitespace is accounted for).
/// 0 when the trimmed text fits in one window; the first non-whitespace position when nothing matches.
/// </returns>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="snippetLength"/> is negative.</exception>
private static int GetSnippetStartPoint(string documentText, string originalQuery, int snippetLength)
{
    // Window step in characters. Higher number: less accurate but faster.
    const int resolution = 5;

    // Checked before trimming so that a null document returns 0 instead of throwing.
    if (string.IsNullOrWhiteSpace(documentText)) return 0;

    // Scoring runs on the trimmed text; the dropped leading whitespace is added back
    // at the end so the result indexes the caller's original string.
    int leadingWhitespace = documentText.Length - documentText.TrimStart().Length;
    string trimmed = documentText.Trim();

    // Return 0 if the entire document fits in the snippet.
    if (trimmed.Length <= snippetLength) return 0;

    if (snippetLength < 0)
    {
        throw new ArgumentOutOfRangeException("snippetLength", snippetLength, "snippetLength must not be negative.");
    }

    // Break the query down into distinct, lower-cased words.
    var wordsInQuery = new HashSet<string>();
    foreach (var word in (originalQuery ?? string.Empty).Split(' '))
    {
        var normalisedWord = word.Trim().ToLowerInvariant();
        if (normalisedWord.Length == 0) continue;
        wordsInQuery.Add(normalisedWord);
    }

    // Nothing can match, so skip the scan.
    if (wordsInQuery.Count == 0) return leadingWhitespace;

    // Last position at which a full window still fits.
    int lastStart = trimmed.Length - snippetLength;

    double maxScore = 0;
    int maxWindowStart = 0;
    int windowStart = 0;

    while (true)
    {
        // Score this window. Because the window moves in steps of `resolution`,
        // its first and last words may be cut and then simply do not match.
        double score = 0;
        foreach (var rawWord in trimmed.Substring(windowStart, snippetLength).Split(' '))
        {
            var word = rawWord.Trim().ToLowerInvariant();
            if (wordsInQuery.Contains(word))
            {
                // The longer the word, the more important.
                // Can simply replace with score += 1 for a simpler model.
                score += Math.Pow(word.Length, 2);
            }
        }

        // Strictly greater: the earliest window wins ties.
        if (score > maxScore)
        {
            maxScore = score;
            maxWindowStart = windowStart;
        }

        if (windowStart == lastStart) break;

        // Clamp the final step so the window ending exactly at the end of the
        // text is always evaluated, even when it is not aligned to `resolution`.
        windowStart = Math.Min(windowStart + resolution, lastStart);
    }

    return leadingWhitespace + maxWindowStart;
}

你还可以在此基础上添加更多功能，例如，你可以尝试比较 SOUNDEX，并让 SOUNDEX 匹配的权重低于完全匹配的权重。

要回复问题请先登录或注册

C＃查找搜索结果显示的相关文档片段

7 个回复

发起人

significance

问题状态

C＃查找搜索结果显示的相关文档片段

与内容相关的链接

7 个回复

发起人

significance

问题状态