Retrait des solitaires des annotables en une seule passe : items, puis users
Showing 1 changed file with 66 additions and 11 deletions
... | @@ -58,10 +58,12 @@ public class PreprocessingRunner implements ApplicationRunner { | ... | @@ -58,10 +58,12 @@ public class PreprocessingRunner implements ApplicationRunner { |
58 | setFilenames(); | 58 | setFilenames(); |
59 | List<AssociationElement> associationElements = loadAssociationElements(new File(dataDir, completeFilename)); | 59 | List<AssociationElement> associationElements = loadAssociationElements(new File(dataDir, completeFilename)); |
60 | // associationElements = cleanupSmallCounts(associationElements, 1, 1); | 60 | // associationElements = cleanupSmallCounts(associationElements, 1, 1); |
61 | - List<Integer> annotateIndexes = chooseAnnotated(associationElements, 1, 1); | 61 | + List<AssociationElement> annotableElements = removeFirstSmallCounts(associationElements, 1, 1); |
62 | - writeSampleAndAnnotated(new File(dataDir, sampleFilename), new File(dataDir, annontatedFilename), annotateIndexes, associationElements); | 62 | + List<Integer> annotateIndexes = chooseAnnotated(associationElements, annotableElements, 1, 1); |
63 | + writeSampleAndAnnotated(new File(dataDir, sampleFilename), new File(dataDir, annontatedFilename), annotateIndexes, associationElements, annotableElements); | ||
63 | } | 64 | } |
64 | 65 | ||
66 | + // TODO retirer duplication de code entre cleanupSmallCounts et removeFirstSmallCounts | ||
65 | private List<AssociationElement> cleanupSmallCounts(List<AssociationElement> associationElements, int userSize, int itemSize) { | 67 | private List<AssociationElement> cleanupSmallCounts(List<AssociationElement> associationElements, int userSize, int itemSize) { |
66 | 68 | ||
67 | boolean removedUser; | 69 | boolean removedUser; |
... | @@ -114,9 +116,59 @@ public class PreprocessingRunner implements ApplicationRunner { | ... | @@ -114,9 +116,59 @@ public class PreprocessingRunner implements ApplicationRunner { |
114 | return associationElements; | 116 | return associationElements; |
115 | } | 117 | } |
116 | 118 | ||
117 | - private List<Integer> chooseAnnotated(List<AssociationElement> associationElements, int userSize, int itemSize) { | 119 | + private List<AssociationElement> removeFirstSmallCounts(List<AssociationElement> associationElements, int userSize, int itemSize) { |
120 | + | ||
121 | + boolean removedUser; | ||
122 | + boolean removedItem; | ||
123 | + long userCount; | ||
124 | + long itemCount; | ||
125 | + Set<Long> itemIdSet; | ||
126 | + Set<Long> userIdSet; | ||
127 | + | ||
128 | + removedUser = false; | ||
129 | + removedItem = false; | ||
130 | + | ||
131 | + // Books or ratings are more alone than users, so we start with them | ||
132 | + itemIdSet = associationElements.stream().map(element -> element.getItemId()).collect(Collectors.toSet()); | ||
133 | + for (Long itemId : itemIdSet) { | ||
134 | + userCount = associationElements.stream().filter(element -> element.getItemId() == itemId).count(); | ||
135 | + if (userCount <= userSize) { | ||
136 | + associationElements = associationElements.stream().filter(element -> element.getItemId() != itemId).collect(Collectors.toList()); | ||
137 | + if (!removedItem) { | ||
138 | + removedItem = true; | ||
139 | + logger.debug("Removed first item"); | ||
140 | + } | ||
141 | + logger.trace("Removed item {}", itemId); | ||
142 | + } | ||
143 | + | ||
144 | + } | ||
145 | + | ||
146 | + logger.debug("Remaining AssociationElement count {}", associationElements.size()); | ||
147 | + | ||
148 | + // Then we remove users | ||
149 | + userIdSet = associationElements.stream().map(element -> element.getUserId()).collect(Collectors.toSet()); | ||
150 | + for (Long userId : userIdSet) { | ||
151 | + itemCount = associationElements.stream().filter(element -> element.getUserId() == userId).count(); | ||
152 | + if (itemCount <= itemSize) { | ||
153 | + associationElements = associationElements.stream().filter(element -> element.getUserId() != userId).collect(Collectors.toList()); | ||
154 | + if (!removedUser) { | ||
155 | + removedUser = true; | ||
156 | + logger.debug("Removed first user"); | ||
157 | + } | ||
158 | + logger.trace("Removed user {}", userId); | ||
159 | + } | ||
160 | + } | ||
161 | + | ||
162 | + logger.debug("Remaining AssociationElement count {}", associationElements.size()); | ||
163 | + | ||
164 | + logger.debug("Remover item or user {}", removedUser || removedItem); | ||
165 | + | ||
166 | + return associationElements; | ||
167 | + } | ||
168 | + | ||
169 | + private List<Integer> chooseAnnotated(List<AssociationElement> annotableElements, List<AssociationElement> associationElements, int userSize, int itemSize) { | ||
118 | List<Integer> annotatedChosen = new ArrayList<>(); | 170 | List<Integer> annotatedChosen = new ArrayList<>(); |
119 | - int size = associationElements.size(); | 171 | + int size = annotableElements.size(); |
120 | long userCount = 0; | 172 | long userCount = 0; |
121 | long itemCount = 0; | 173 | long itemCount = 0; |
122 | AssociationElement randomAssociationElement; | 174 | AssociationElement randomAssociationElement; |
... | @@ -128,16 +180,17 @@ public class PreprocessingRunner implements ApplicationRunner { | ... | @@ -128,16 +180,17 @@ public class PreprocessingRunner implements ApplicationRunner { |
128 | randomInteger = new Integer(random.nextInt(size)); | 180 | randomInteger = new Integer(random.nextInt(size)); |
129 | 181 | ||
130 | if (!annotatedChosen.contains(randomInteger)) { | 182 | if (!annotatedChosen.contains(randomInteger)) { |
131 | - randomAssociationElement = associationElements.get(randomInteger); | 183 | + randomAssociationElement = annotableElements.get(randomInteger); |
132 | final Long itemId = randomAssociationElement.getItemId(); | 184 | final Long itemId = randomAssociationElement.getItemId(); |
133 | final Long userId = randomAssociationElement.getUserId(); | 185 | final Long userId = randomAssociationElement.getUserId(); |
134 | userCount = associationElements.stream().filter(element -> element.getItemId() == itemId).count(); | 186 | userCount = associationElements.stream().filter(element -> element.getItemId() == itemId).count(); |
135 | itemCount = associationElements.stream().filter(element -> element.getUserId() == userId).count(); | 187 | itemCount = associationElements.stream().filter(element -> element.getUserId() == userId).count(); |
188 | + logger.trace("Checking new AssociationElement for annotation"); | ||
136 | 189 | ||
137 | // Decreasing values based on planned suppressions | 190 | // Decreasing values based on planned suppressions |
138 | // TODO Refactor writeSampleAndAnnotated and chooseAnnotated to avoid this | 191 | // TODO Refactor writeSampleAndAnnotated and chooseAnnotated to avoid this |
139 | for (Integer annotatedIndex : annotatedChosen) { | 192 | for (Integer annotatedIndex : annotatedChosen) { |
140 | - checkingAssociationElement = associationElements.get(annotatedIndex); | 193 | + checkingAssociationElement = annotableElements.get(annotatedIndex); |
141 | if (checkingAssociationElement.getUserId() == userId) { | 194 | if (checkingAssociationElement.getUserId() == userId) { |
142 | userCount--; | 195 | userCount--; |
143 | } | 196 | } |
... | @@ -148,6 +201,7 @@ public class PreprocessingRunner implements ApplicationRunner { | ... | @@ -148,6 +201,7 @@ public class PreprocessingRunner implements ApplicationRunner { |
148 | 201 | ||
149 | if (userCount > userSize && itemCount > itemSize) { | 202 | if (userCount > userSize && itemCount > itemSize) { |
150 | annotatedChosen.add(randomInteger); | 203 | annotatedChosen.add(randomInteger); |
204 | + logger.debug("Adding new AssociationElement to annotated, total is {}", annotatedChosen.size()); | ||
151 | } | 205 | } |
152 | } | 206 | } |
153 | } | 207 | } |
... | @@ -155,9 +209,10 @@ public class PreprocessingRunner implements ApplicationRunner { | ... | @@ -155,9 +209,10 @@ public class PreprocessingRunner implements ApplicationRunner { |
155 | return annotatedChosen; | 209 | return annotatedChosen; |
156 | } | 210 | } |
157 | 211 | ||
158 | - private void writeSampleAndAnnotated(File sampleFile, File annotatedFile, List<Integer> annotateIndexes, List<AssociationElement> associationElements) throws PreprocessingException { | 212 | + private void writeSampleAndAnnotated(File sampleFile, File annotatedFile, List<Integer> annotateIndexes, List<AssociationElement> associationElements, List<AssociationElement> annotableElements) throws PreprocessingException { |
159 | try { | 213 | try { |
160 | AssociationElement associationElement; | 214 | AssociationElement associationElement; |
215 | + Integer annotableIndex; | ||
161 | if (ratings) { | 216 | if (ratings) { |
162 | RatingElement ratingElement; | 217 | RatingElement ratingElement; |
163 | CSVFormat ratingsFormat = CSVFormat.TDF.withHeader("itemId", "userId", "rating"); | 218 | CSVFormat ratingsFormat = CSVFormat.TDF.withHeader("itemId", "userId", "rating"); |
... | @@ -166,8 +221,8 @@ public class PreprocessingRunner implements ApplicationRunner { | ... | @@ -166,8 +221,8 @@ public class PreprocessingRunner implements ApplicationRunner { |
166 | 221 | ||
167 | for (int i = 0; i < associationElements.size(); i++) { | 222 | for (int i = 0; i < associationElements.size(); i++) { |
168 | ratingElement = (RatingElement) associationElements.get(i); | 223 | ratingElement = (RatingElement) associationElements.get(i); |
169 | - Integer index = new Integer(i); | 224 | + annotableIndex = new Integer(annotableElements.indexOf(ratingElement)); |
170 | - if (annotateIndexes.contains(index)) { | 225 | + if (annotableIndex >=0 && annotateIndexes.contains(annotableIndex)) { |
171 | annotatedPrinter.printRecord(ratingElement.getItemId(), ratingElement.getUserId(), ratingElement.getRating()); | 226 | annotatedPrinter.printRecord(ratingElement.getItemId(), ratingElement.getUserId(), ratingElement.getRating()); |
172 | } else { | 227 | } else { |
173 | samplePrinter.printRecord(ratingElement.getItemId(), ratingElement.getUserId(), ratingElement.getRating()); | 228 | samplePrinter.printRecord(ratingElement.getItemId(), ratingElement.getUserId(), ratingElement.getRating()); |
... | @@ -183,8 +238,8 @@ public class PreprocessingRunner implements ApplicationRunner { | ... | @@ -183,8 +238,8 @@ public class PreprocessingRunner implements ApplicationRunner { |
183 | 238 | ||
184 | for (int i = 0; i < associationElements.size(); i++) { | 239 | for (int i = 0; i < associationElements.size(); i++) { |
185 | associationElement = associationElements.get(i); | 240 | associationElement = associationElements.get(i); |
186 | - Integer index = new Integer(i); | 241 | + annotableIndex = new Integer(annotableElements.indexOf(associationElement)); |
187 | - if (annotateIndexes.contains(index)) { | 242 | + if (annotableIndex >=0 && annotateIndexes.contains(annotableIndex)) { |
188 | annotatedPrinter.printRecord(associationElement.getItemId(), associationElement.getUserId()); | 243 | annotatedPrinter.printRecord(associationElement.getItemId(), associationElement.getUserId()); |
189 | } else { | 244 | } else { |
190 | samplePrinter.printRecord(associationElement.getItemId(), associationElement.getUserId()); | 245 | samplePrinter.printRecord(associationElement.getItemId(), associationElement.getUserId()); | ... | ... |
-
Please register or login to post a comment