G-Man's Uber Software Engineering Blog: Lucene sorting Strings

Lucene out of the box stores the whole String for each entry (or term) if you make it sort by a String column. In most cases you don't need that precision so if you for instance only want to sort by uppercase characters and can live with sorting only by the first six of them youc an cut down your memory consumption dramatically.

All you have to do is write some custom comparator and some custom SortComparator with the right source, too. Here is the code:


001 package com.sony.soe.platform.players.util;

002 

003 import java.io.IOException;

004 import java.util.Comparator;

005 

006 import org.apache.lucene.index.IndexReader;

007 import org.apache.lucene.index.Term;

008 import org.apache.lucene.index.TermDocs;

009 import org.apache.lucene.index.TermEnum;

010 import org.apache.lucene.search.ScoreDoc;

011 import org.apache.lucene.search.ScoreDocComparator;

012 import org.apache.lucene.search.SortComparatorSource;

013 import org.apache.lucene.search.SortField;

014 

015 import com.sony.soe.platform.players.exception.PlayersException;

016 

017 /**

018  * We have some special requirements regarding Strings so treat them special;-)

019  * @author geichberger

020  *

021  */

022 public class CustomComparator implements Comparator, SortComparatorSource {

023 

024   public  int compare(Object value1, Object value2) {

025      return staticCompare(value1, value2);

026   }

027   

028   public static int staticCompare(Object value1, Object value2) {

029     if(value1 == null && value2 == null){

030             return 0; 

031         }

032         //null is a smaller value than Not Null

033         if(value1 == null)

034             return -1;

035         if(value2 == null)

036             return 1;

037        

038         if ((value1 instanceof String) && (value2 instanceof String)) {

039            String s1 = ((String)value1).replaceAll("[\\W[_]]", ""); //remove all funny characters

040            String s2 = ((String)value2).replaceAll("[\\W[_]]", ""); //remove all funny characters

041            return s1.compareToIgnoreCase(s2);

042         } else if (value1 instanceof Comparable) {

043             return ((Comparable) value1).compareTo((Comparable) value2);

044         } 

045         return 0;

046   }

047 

048   public ScoreDocComparator newComparator(IndexReader arg0, String arg1) throws IOException {

049     return new FirstStageStringDocComparator(arg0, arg1);

050   }

051   

052   public static class FirstStageStringDocComparator implements ScoreDocComparator {

053     private int scores[];

054     

055     public FirstStageStringDocComparator() {

056       

057     }

058     

059     public FirstStageStringDocComparator(IndexReader reader, String fieldname) throws IOException, PlayersException {

060       final TermEnum enumerator = reader.terms(new Term(fieldname, ""));

061       scores = new int[reader.maxDoc()];

062       if (scores.length>0) {

063         TermDocs termDocs = reader.termDocs();

064         try {

065           if (enumerator.term() == null ) {

066             throw new PlayersException();

067           }

068           do {

069             Term term = enumerator.term();

070             if (!term.field().equalsIgnoreCase(fieldname)) break;

071             termDocs.seek(enumerator);

072             while (termDocs.next()) {

073               String s = term.text().replaceAll("[\\W[_[\\d]]]", "").toUpperCase(); //work our magic (also ignore numbers)

074               

075               int score = 0;

076               for (int i=0; i<6 && i<s.length(); i++) {

077                  //the score is the value of the first three characters base 26;-)

078                 //max score 26^3=17,576

079                 score += (s.charAt(i)-65)*Math.pow(26, 5-i); //this is some shortcut because we ignore numbers...

080               }

081               scores[termDocs.doc()] = score;

082             }

083           } while (enumerator.next());

084         } finally {

085           termDocs.close();

086         }

087       }

088     }

089     

090     public int compare(ScoreDoc i, ScoreDoc j) {

091       if (scores[i.doc] < scores[j.doc]) return -1;

092       if (scores[i.doc] > scores[j.doc]) return 1;

093       return 0;

094     }

095 

096     public int sortType() {

097       return SortField.INT;

098     }

099 

100     public Comparable sortValue(ScoreDoc i) {

101       return new Integer(scores[i.doc]);

102     }

103 

104     public int[] getScores() {

105       return scores;

106     }

107 

108     public void setScores(int[] scores) {

109       this.scores = scores;

110     }

111 

112   

113     

114   }

115 

116 }

Java2html

Then to use it all you have to do is use the following Sortfield in your query:new SortField(indexPart, new CustomComparator(), reverse) UPDATE: You will also need to add hashcode and equals to the CustomComperator class - otherwise you will create a memory leak.

2 Comments:

German said...

I had the pleasure to talk to the Lucene guys at ApacheCon and they promised me to add some way of carrying through the score they give during indexing so it can be used for sorting.

This wasn't really big in their agenda because they mostly are interested in sorting by relevance.

6:05 PM

Anonymous said...

Fuck... its impossible to read...

12:22 PM

<< Home

G-Man's Uber Software Engineering Blog

Friday, May 18, 2007

Lucene sorting Strings

2 Comments:

About Me

Previous Posts