Out of the box, Lucene stores the whole String for each entry (or term) if you make it sort by a String column. In most cases you don't need that precision, so if, for instance, you only want to sort by uppercase characters and can live with sorting by only the first six of them, you can cut down your memory consumption dramatically.
All you have to do is write a custom ScoreDocComparator and a custom SortComparatorSource that creates it. Here is the code:
001 package com.sony.soe.platform.players.util;
002
003 import java.io.IOException;
004 import java.util.Comparator;
005
006 import org.apache.lucene.index.IndexReader;
007 import org.apache.lucene.index.Term;
008 import org.apache.lucene.index.TermDocs;
009 import org.apache.lucene.index.TermEnum;
010 import org.apache.lucene.search.ScoreDoc;
011 import org.apache.lucene.search.ScoreDocComparator;
012 import org.apache.lucene.search.SortComparatorSource;
013 import org.apache.lucene.search.SortField;
014
015 import com.sony.soe.platform.players.exception.PlayersException;
016
017 /**
018 * We have some special requirements regarding Strings so treat them special;-)
019 * @author geichberger
020 *
021 */
022 public class CustomComparator implements Comparator, SortComparatorSource {
023
024 public int compare(Object value1, Object value2) {
025 return staticCompare(value1, value2);
026 }
027
028 public static int staticCompare(Object value1, Object value2) {
029 if(value1 == null && value2 == null){
030 return 0;
031 }
032 //null is a smaller value than Not Null
033 if(value1 == null)
034 return -1;
035 if(value2 == null)
036 return 1;
037
038 if ((value1 instanceof String) && (value2 instanceof String)) {
039 String s1 = ((String)value1).replaceAll("[\\W[_]]", ""); //remove all funny characters
040 String s2 = ((String)value2).replaceAll("[\\W[_]]", ""); //remove all funny characters
041 return s1.compareToIgnoreCase(s2);
042 } else if (value1 instanceof Comparable) {
043 return ((Comparable) value1).compareTo((Comparable) value2);
044 }
045 return 0;
046 }
047
048 public ScoreDocComparator newComparator(IndexReader arg0, String arg1) throws IOException {
049 return new FirstStageStringDocComparator(arg0, arg1);
050 }
051
052 public static class FirstStageStringDocComparator implements ScoreDocComparator {
053 private int scores[];
054
055 public FirstStageStringDocComparator() {
056
057 }
058
059 public FirstStageStringDocComparator(IndexReader reader, String fieldname) throws IOException, PlayersException {
060 final TermEnum enumerator = reader.terms(new Term(fieldname, ""));
061 scores = new int[reader.maxDoc()];
062 if (scores.length>0) {
063 TermDocs termDocs = reader.termDocs();
064 try {
065 if (enumerator.term() == null ) {
066 throw new PlayersException();
067 }
068 do {
069 Term term = enumerator.term();
070 if (!term.field().equalsIgnoreCase(fieldname)) break;
071 termDocs.seek(enumerator);
072 while (termDocs.next()) {
073 String s = term.text().replaceAll("[\\W[_[\\d]]]", "").toUpperCase(); //work our magic (also ignore numbers)
074
075 int score = 0;
076 for (int i=0; i<6 && i<s.length(); i++) {
077 //the score is the value of the first three characters base 26;-)
078 //max score 26^3=17,576
079 score += (s.charAt(i)-65)*Math.pow(26, 5-i); //this is some shortcut because we ignore numbers...
080 }
081 scores[termDocs.doc()] = score;
082 }
083 } while (enumerator.next());
084 } finally {
085 termDocs.close();
086 }
087 }
088 }
089
090 public int compare(ScoreDoc i, ScoreDoc j) {
091 if (scores[i.doc] < scores[j.doc]) return -1;
092 if (scores[i.doc] > scores[j.doc]) return 1;
093 return 0;
094 }
095
096 public int sortType() {
097 return SortField.INT;
098 }
099
100 public Comparable sortValue(ScoreDoc i) {
101 return new Integer(scores[i.doc]);
102 }
103
104 public int[] getScores() {
105 return scores;
106 }
107
108 public void setScores(int[] scores) {
109 this.scores = scores;
110 }
111
112
113
114 }
115
116 }
|
Java2html
|
Then, to use it, all you have to do is pass the following SortField to your query:
new SortField(indexPart, new CustomComparator(), reverse)
UPDATE: You will also need to add hashCode and equals to the CustomComparator class - otherwise you will create a memory leak.
2 Comments:
I had the pleasure to talk to the Lucene guys at ApacheCon and they promised me to add some way of carrying through the score they give during indexing so it can be used for sorting.
This wasn't really big in their agenda because they mostly are interested in sorting by relevance.
6:05 PM
Fuck... its impossible to read...
12:22 PM
Post a Comment
<< Home