Cleanup and testing

This commit is contained in:
openaudible 2018-05-28 17:39:21 -07:00
parent 8bcd07bb0b
commit 09d8ca3f2f
4 changed files with 9 additions and 88 deletions

View file

@ -1,52 +1,23 @@
package org.openaudible.audible; package org.openaudible.audible;
import com.gargoylesoftware.htmlunit.html.DomNode; import com.gargoylesoftware.htmlunit.html.DomNode;
import com.gargoylesoftware.htmlunit.html.HtmlAnchor;
import com.gargoylesoftware.htmlunit.html.HtmlPage; import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.google.gson.JsonArray;
import org.apache.commons.logging.Log; import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory; import org.apache.commons.logging.LogFactory;
import org.json.JSONArray; import org.json.JSONArray;
import org.json.JSONObject; import org.json.JSONObject;
import org.openaudible.books.Book; import org.openaudible.books.Book;
import org.openaudible.books.BookElement; import org.openaudible.books.BookElement;
import org.openaudible.util.HTMLUtil;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
public enum BookPageParser { public enum BookPageParser {
instance; instance; // Singleton
private static final Log LOG = LogFactory.getLog(BookPageParser.class); private static final Log LOG = LogFactory.getLog(BookPageParser.class);
/**
 * Returns the result of {@code HTMLUtil.text} applied to whatever
 * {@code HTMLUtil.findByClass(c, h)} locates.
 *
 * @param c lookup key forwarded to {@code HTMLUtil.findByClass}
 *          (presumably a CSS class name -- TODO confirm against HTMLUtil)
 * @param h node forwarded to {@code HTMLUtil.findByClass}
 *          (presumably the root of the search -- TODO confirm)
 * @return the value produced by {@code HTMLUtil.text}; what happens when
 *         nothing matches is decided by HTMLUtil and is not visible here
 */
public String extract(String c, DomNode h) {
return HTMLUtil.text(HTMLUtil.findByClass(c, h));
}
/**
 * Joins the trimmed text content of every direct child of the node located
 * by {@code HTMLUtil.findByClass(c, h)}, one child per line.
 *
 * Children whose text content is {@code null} are skipped. A newline is
 * inserted before a child's text only when something has already been
 * accumulated, so the result never starts with a separator.
 *
 * @param c lookup key forwarded to {@code HTMLUtil.findByClass}
 *          (presumably a CSS class name -- TODO confirm against HTMLUtil)
 * @param h node forwarded to {@code HTMLUtil.findByClass}
 * @return the newline-joined child texts, or the empty string when the
 *         lookup returns {@code null}
 */
public String extractParagraph(String c, DomNode h) {
    DomNode match = (DomNode) HTMLUtil.findByClass(c, h);
    if (match == null) {
        return "";
    }

    StringBuilder joined = new StringBuilder();
    NodeList children = match.getChildNodes();
    for (int i = 0; i < children.getLength(); i++) {
        String piece = children.item(i).getTextContent();
        if (piece == null) {
            continue;
        }
        if (joined.length() > 0) {
            joined.append("\n");
        }
        joined.append(piece.trim());
    }
    return joined.toString();
}
// audible uses a lot of cdata. It is useful.
List<String> getCDATATags(String html) List<String> getCDATATags(String html)
{ {
ArrayList<String> list = new ArrayList<>(); ArrayList<String> list = new ArrayList<>();
@ -72,7 +43,7 @@ public enum BookPageParser {
public boolean parseBookPage(HtmlPage page, Book b) { public boolean parseBookPage(HtmlPage page, Book b) {
DomNode h = page; DomNode h = page;
HTMLUtil.debugNode(page, "book_info"); // HTMLUtil.debugNode(page, "book_info");
String xml = page.asXml(); String xml = page.asXml();
List<String> cdataList = getCDATATags(xml); List<String> cdataList = getCDATATags(xml);
for (String cd:cdataList) for (String cd:cdataList)
@ -102,33 +73,7 @@ public enum BookPageParser {
} }
/* // right now we only care about the @type AudioBook meta data.
"image": "https://m.media-amazon.com/images/I/51u1om96bmL._SL500_.jpg",
"@type": "Audiobook",
"author": [{
"@type": "Person",
"name": "Amanda Hodgkinson"
}],
"readBy": [{
"@type": "Person",
"name": "Robin Sachs"
}],
"description": "<p>A tour de force that echoes modern classics like <i>Suite Francaise<\/i> and <i>The Postmistress<\/i>. <\/p><p>\"Housekeeper or housewife?\" the soldier asks Silvana as she and eight-year-old Aurek board the ship that will take them from Poland to England at the end of World War II. There her husband, Janusz, is already waiting for them at the little house at 22 Britannia Road. But the war has changed them all so utterly that they'll barely recognize one another when they are reunited. \"Survivor,\" she answers.<\/p><p>Silvana and Aurek spent the war hiding in the forests of Poland. Wild, almost feral Aurek doesn't know how to tie his own shoes or sleep in a bed. Janusz is an Englishman now-determined to forget Poland, forget his own ghosts from the way, and begin a new life as a proper English family. But for Silvana, who cannot escape the painful memory of a shattering wartime act, forgetting is not a possibility.<\/p>",
"abridged": "false",
"inLanguage": "english",
"bookFormat": "AudiobookFormat",
"@context": "https://schema.org",
"datePublished": "2011-04-28",
"duration": "PT11H19M",
"name": "22 Britannia Road",
"publisher": "Penguin Audio",
"aggregateRating": {
"@type": "AggregateRating",
"ratingValue": "3.6842105263157894",
"ratingCount": "171"
}
*/
private void extractFromJSON(JSONObject obj, Book b) { private void extractFromJSON(JSONObject obj, Book b) {
String typ = obj.optString("@type"); String typ = obj.optString("@type");
if (typ == null || typ.isEmpty()) if (typ == null || typ.isEmpty())
@ -136,14 +81,12 @@ public enum BookPageParser {
if (!"AudioBook".equalsIgnoreCase(typ)) // && !"Product".equalsIgnoreCase(typ)) if (!"AudioBook".equalsIgnoreCase(typ)) // && !"Product".equalsIgnoreCase(typ))
return; return;
LOG.info(obj.toString(2)); // LOG.info(obj.toString(2));
for (String k:obj.keySet()) for (String k:obj.keySet())
{ {
System.out.println(k+" = "+ obj.get(k));
Object value = obj.get(k); Object value = obj.get(k);
String str = value!=null ? value.toString():""; String str = value!=null ? value.toString():"";
BookElement elem = null; BookElement elem = null;
switch(k) switch(k)
@ -195,7 +138,7 @@ public enum BookPageParser {
elem = BookElement.publisher; elem = BookElement.publisher;
break; break;
default: default:
LOG.info("Skipping "+k+" = "+ str); // LOG.info("Skipping "+k+" = "+ str);
break; break;
} }
@ -206,12 +149,7 @@ public enum BookPageParser {
b.set(elem, str); b.set(elem, str);
} }
} }
} }
} }
@ -224,6 +162,7 @@ public enum BookPageParser {
// "name": "Robin Racer" // "name": "Robin Racer"
// }], // }],
// array of 'person' objects.
private String personToString(JSONArray arr) { private String personToString(JSONArray arr) {
String out = ""; String out = "";
for (int x=0;x<arr.length();x++) for (int x=0;x<arr.length();x++)
@ -238,7 +177,6 @@ public enum BookPageParser {
out += name; out += name;
} }
} }
return out; return out;
} }

View file

@ -124,7 +124,7 @@ public enum LibraryParser {
if (r.getCells().size() != BookColumns.size()) { if (r.getCells().size() != BookColumns.size()) {
LOG.error("wrong number of columns found: " + r.getCells().size() + " != " + BookColumns.size()); LOG.error("wrong number of columns found: " + r.getCells().size() + " != " + BookColumns.size());
LOG.error(xml); LOG.error(xml);
HTMLUtil.debugNode(r, "bad_col.xml"); if (debug) HTMLUtil.debugNode(r, "bad_col.xml");
return null; return null;
} }
@ -201,7 +201,7 @@ public enum LibraryParser {
if (text.contains("by parts")) if (text.contains("by parts"))
{ {
LOG.error("error with title: "+text); LOG.error("error with title: "+text);
HTMLUtil.debugNode(cell, col.name()+".xml"); if (debug) HTMLUtil.debugNode(cell, col.name()+".xml");
// bug check. // bug check.
} }

View file

@ -71,14 +71,8 @@ public class Book implements Comparable<Book>, Serializable {
public boolean equals(Book that) { public boolean equals(Book that) {
if (that==null) return false; if (that==null) return false;
if (this==that) return true; if (this==that) return true;
boolean e1 = this.getProduct_id().equals(that.getProduct_id()); boolean e1 = this.getProduct_id().equals(that.getProduct_id());
// boolean e2 = this.getAsin().equals(that.getAsin());
// assert (e1 == e2);
return e1; return e1;
} }
public boolean isOK() { public boolean isOK() {

View file

@ -142,16 +142,6 @@ public class BookInfoPanel extends GridComposite implements BookListener {
return c; return c;
} }
/**
 * Sets the text of {@code task} to the empty string.
 *
 * The earlier version built the text in a local and had an empty
 * {@code if (curBook != null)} branch that never changed it; that dead
 * code is removed here, and the observable effect is the same.
 *
 * @param b currently unused
 */
private void updateTask(Book b) {
    task.setText("");
}
private void update(Book b) { private void update(Book b) {
curBook = b; curBook = b;
@ -311,7 +301,6 @@ public class BookInfoPanel extends GridComposite implements BookListener {
task.setText(msg); task.setText(msg);
} }
}); });
} }
} }