Out of the box, Lucene stores the whole String for each entry (or term) if you make it sort by a String column. In most cases you don't need that precision. If, for instance, you only want to sort by uppercase characters and can live with sorting by only the first six of them, you can cut down your memory consumption dramatically.
All you have to do is write a custom comparator that also acts as a SortComparatorSource, handing Lucene a custom ScoreDocComparator. Here is the code:
  
 
  
 001 package com.sony.soe.platform.players.util;
 
 002
 
 003 import java.io.IOException;
 
 004 import java.util.Comparator;
 
 005
 
 006 import org.apache.lucene.index.IndexReader;
 
 007 import org.apache.lucene.index.Term;
 
 008 import org.apache.lucene.index.TermDocs;
 
 009 import org.apache.lucene.index.TermEnum;
 
 010 import org.apache.lucene.search.ScoreDoc;
 
 011 import org.apache.lucene.search.ScoreDocComparator;
 
 012 import org.apache.lucene.search.SortComparatorSource;
 
 013 import org.apache.lucene.search.SortField;
 
 014
 
 015 import com.sony.soe.platform.players.exception.PlayersException;
 
 016
 
 017 /**
 
 018  * We have some special requirements regarding Strings so treat them special;-)
 
 019  * @author geichberger
 
 020  *
 
 021  */
 
 022 public class CustomComparator implements Comparator, SortComparatorSource {
 
 023
 
 024   public  int compare(Object value1, Object value2) {
 
 025      return staticCompare(value1, value2);
 
 026   }
 
 027
 
 028   public static int staticCompare(Object value1, Object value2) {
 
 029     if(value1 == null && value2 == null){
 
 030             return 0;
 
 031         }
 
 032         //null is a smaller value than Not Null
 
 033         if(value1 == null)
 
 034             return -1;
 
 035         if(value2 == null)
 
 036             return 1;
 
 037
 
 038         if ((value1 instanceof String) && (value2 instanceof String)) {
 
 039            String s1 = ((String)value1).replaceAll("[\\W[_]]", ""); //remove all funny characters
 
 040            String s2 = ((String)value2).replaceAll("[\\W[_]]", ""); //remove all funny characters
 
 041            return s1.compareToIgnoreCase(s2);
 
 042         } else if (value1 instanceof Comparable) {
 
 043             return ((Comparable) value1).compareTo((Comparable) value2);
 
 044         }
 
 045         return 0;
 
 046   }
 
 047
 
 048   public ScoreDocComparator newComparator(IndexReader arg0, String arg1) throws IOException {
 
 049     return new FirstStageStringDocComparator(arg0, arg1);
 
 050   }
 
 051
 
 052   public static class FirstStageStringDocComparator implements ScoreDocComparator {
 
 053     private int scores[];
 
 054
 
 055     public FirstStageStringDocComparator() {
 
 056
 
 057     }
 
 058
 
 059     public FirstStageStringDocComparator(IndexReader reader, String fieldname) throws IOException, PlayersException {
 
 060       final TermEnum enumerator = reader.terms(new Term(fieldname, ""));
 
 061       scores = new int[reader.maxDoc()];
 
 062       if (scores.length>0) {
 
 063         TermDocs termDocs = reader.termDocs();
 
 064         try {
 
 065           if (enumerator.term() == null ) {
 
 066             throw new PlayersException();
 
 067           }
 
 068           do {
 
 069             Term term = enumerator.term();
 
 070             if (!term.field().equalsIgnoreCase(fieldname)) break;
 
 071             termDocs.seek(enumerator);
 
 072             while (termDocs.next()) {
 
 073               String s = term.text().replaceAll("[\\W[_[\\d]]]", "").toUpperCase(); //work our magic (also ignore numbers)
 
 074
 
 075               int score = 0;
 
 076               for (int i=0; i<6 && i<s.length(); i++) {
 
 077                  //the score is the value of the first three characters base 26;-)
 
 078                 //max score 26^3=17,576
 
 079                 score += (s.charAt(i)-65)*Math.pow(26, 5-i); //this is some shortcut because we ignore numbers...
 
 080               }
 
 081               scores[termDocs.doc()] = score;
 
 082             }
 
 083           } while (enumerator.next());
 
 084         } finally {
 
 085           termDocs.close();
 
 086         }
 
 087       }
 
 088     }
 
 089
 
 090     public int compare(ScoreDoc i, ScoreDoc j) {
 
 091       if (scores[i.doc] < scores[j.doc]) return -1;
 
 092       if (scores[i.doc] > scores[j.doc]) return 1;
 
 093       return 0;
 
 094     }
 
 095
 
 096     public int sortType() {
 
 097       return SortField.INT;
 
 098     }
 
 099
 
 100     public Comparable sortValue(ScoreDoc i) {
 
 101       return new Integer(scores[i.doc]);
 
 102     }
 
 103
 
 104     public int[] getScores() {
 
 105       return scores;
 
 106     }
 
 107
 
 108     public void setScores(int[] scores) {
 
 109       this.scores = scores;
 
 110     }
 
 111
 
 112
 
 113
 
 114   }
 
 115
 
 116 }
 
 | 
 
  
 
  
   
 Java2html
 
 
 | 
  
 
Then, to use it, all you have to do is pass the following SortField to your query:
new SortField(indexPart, new CustomComparator(), reverse)
UPDATE: You will also need to add hashCode() and equals() to the CustomComparator class - otherwise you will create a memory leak.
    
     
    
    
  
  
2 Comments:
I had the pleasure to talk to the Lucene guys at ApacheCon and they promised me to add some way of carrying through the score they give during indexing so it can be used for sorting.
This wasn't really big in their agenda because they mostly are interested in sorting by relevance.
6:05 PM
Fuck... its impossible to read...
12:22 PM
Post a Comment
<< Home