2
\$\begingroup\$

I am developing an application that goes through 2 websites and gets all the articles, but my code is identical in most parts. Is there a way to optimize this code? (TL and DN are the naming conventions of the newspapers.)

public class Main {
 public static void main(String[] args) {
  // Shared channel: the crawler (producer) pushes articles, the consumer drains them.
  BlockingQueue<JsonObject> articleQueue = new LinkedBlockingQueue<JsonObject>();
  FakeJsonConsumer jsonConsumer = new FakeJsonConsumer(articleQueue);
  jsonConsumer.start();
  DNCrawler crawler = new DNCrawler(articleQueue);
  crawler.start();
 }
}
public class DNSpecific {
 // Entry points on dn.se from which every crawl iteration starts.
 public static final String[] dnSeeds = { "http://www.dn.se/",
 "http://www.dn.se/ekonomi/",
 "http://www.dn.se/sthlm/",
 "http://www.dn.se/sport/",
 "http://www.dn.se/kultur/",
 "http://www.dn.se/ledare/",
 "http://www.dn.se/motor/"
 };

 /** Returns a fresh, mutable set containing all DN seed URLs. */
 public static Set<String> getSeedSet() {
  Set<String> result = new HashSet<String>();
  for (int i = 0; i < dnSeeds.length; i++) {
   result.add(dnSeeds[i]);
  }
  return result;
 }

 // Sections/resources that never contain articles (feeds, web TV, print views, assets).
 private static final Pattern DN_FILTER = Pattern.compile(".*((/rss/|/webb-tv/|/fondlistor/|rm=print|/css/|/pages/)).*");

 /**
  * Decides whether a discovered URL is worth fetching: it must be an absolute
  * URL on www.dn.se and must not match the non-article filter above.
  */
 public static boolean isRelevant(String url) {
  boolean blocked = DN_FILTER.matcher(url).matches();
  if (blocked) {
   return false;
  }
  if (url.startsWith("/")) {
   // Relative links are unexpected here (caller resolves abs:href); log and skip.
   System.out.println("DN local web reference? "+url);
   return false;
  }
  return url.startsWith("http://www.dn.se/");
 }
}
/**
 * Crawler thread for www.dn.se.
 *
 * On each iteration it re-fetches the DN seed pages, collects every relevant
 * outgoing link, downloads pages not seen in earlier iterations, and forwards
 * each page that contains an &lt;article&gt; tag to the consumer via jsonQueue.
 *
 * NOTE(review): nearly identical to TLCrawler -- the common logic belongs in a
 * shared (abstract) base class parameterized by site name, root URL and seeds.
 */
public class DNCrawler extends Thread {
 private final int MaxSeedPageRetry = 5; // Extra attempts allowed when fetching seed pages
 private final int MaxPageRetry = 20; // Extra attempts allowed when fetching reachable pages
 private final int IterationSleep = 10; // Time between iterations in minutes
 private final int RetrySleep = 500; // Time between retries in milliseconds
 private final BlockingQueue<JsonObject> jsonQueue; // Our output channel
 // Extracted articles are handed to the consumer through the given queue.
 public DNCrawler(BlockingQueue<JsonObject> queue) {
 super("DNCrawlerThread");
 jsonQueue = queue;
 System.out.println("Enter "+this.getClass().getName());
 }
 // Main loop: one full initial crawl, then an incremental crawl every
 // IterationSleep minutes that only fetches URLs not present in url2doc.
 @Override
 public void run() {
 System.out.println("DNCrawler up and running ... ");
 System.out.println("\n************************************ ");
 System.out.println(getTime()+": Initial search started on "+getDay());
 // Setup DN specific seeds
 Set<String> seedSet = DNSpecific.getSeedSet();
 System.out.println("Seeds: " + seedSet);
 // Traverse seeds to find reachable
 HashMap<String,Document> url2doc = new HashMap<String,Document>();
 Set<String> urlSet = traverseSeeds(seedSet,url2doc,MaxSeedPageRetry);
 System.out.print("Seed Reachable DN Urls: "+ urlSet.size());
 //System.exit(-1);
 traverseReachable(urlSet,url2doc,MaxPageRetry);
 // Repeat forever
 while (true) {
 try { Thread.sleep(IterationSleep*60*1000); // Wait for next iteration
 // NOTE(review): interrupt is swallowed; consider restoring the interrupt
 // flag (Thread.currentThread().interrupt()) and terminating the loop.
 } catch (InterruptedException e) { e.printStackTrace();} 
 System.out.println("\n"+getTime()+": New search started");
 urlSet = traverseSeeds(seedSet,url2doc,MaxSeedPageRetry);
 System.out.print("Seed Reachable DN Urls: "+ urlSet.size());
 // Find new reachable urls
 HashSet<String> newReachable = new HashSet<String>();
 for (String url : urlSet) {
 if (url2doc.get(url) == null) { // not downloaded in any earlier iteration
 //System.out.println("New: "+url);
 newReachable.add(url);
 }
 }
 // Fetch new reachable pages
 traverseReachable(newReachable,url2doc,MaxPageRetry);
 }
 }
 // Downloads every seed page, records it in url2doc, and returns the set of
 // relevant absolute URLs linked from the seeds (the seeds themselves excluded).
 // Timed-out seeds are re-queued; at most maxRetry extra attempts in total.
 private Set<String> traverseSeeds(Set<String> seedSet, HashMap<String,Document> url2doc, int maxRetry) {
 ArrayDeque<String> toVisit = new ArrayDeque<String>(seedSet);
 HashSet<String> urlSet = new HashSet<String>();
 int maxAttempts = maxRetry + toVisit.size();
 int tryCount = 0;
 while (!toVisit.isEmpty() && tryCount<maxAttempts) {
 tryCount++;
 String seedUrl = toVisit.removeFirst();
 Document seedPage = null;
 try {
 seedPage = Jsoup.connect(seedUrl).get();
 url2doc.put(seedUrl, seedPage);
 // Find all outgoing links and add relevant links 
 Elements links = seedPage.select("a[href]");
 for (Element link : links) {
 String url = link.attr("abs:href").trim().toLowerCase();
 if ( !seedSet.contains(url) && isRelevant(url)) {
 urlSet.add(url); 
 }
 }
 }
 catch (SocketTimeoutException ex) { // Time-out ==> add to queue again and sleep for a while
 toVisit.add(seedUrl);
 try { Thread.sleep(RetrySleep); // Calm down!
 } catch (InterruptedException e) { e.printStackTrace();} 
 }
 catch (Exception ex) { // HTML fetch problem ==> drop (in this iteration)
 System.err.println("\t"+ex.getMessage()+" "+seedUrl);
 }
 }
 // NOTE(review): this also fires when the final attempt succeeded exactly at
 // the limit, so the message can be a false alarm.
 if (tryCount == maxAttempts)
 System.err.println("Failed to download all seeds within given limit ("+maxRetry+") of retries!");
 return urlSet;
 }
 // Downloads every URL in reachableSet, records pages in url2doc, and queues
 // any page identified as an article. Prints a one-line progress summary.
 private void traverseReachable(Set<String> reachableSet, HashMap<String,Document> url2doc, int maxRetry) {
 System.out.print(", New Reachable Urls: "+ reachableSet.size());
 long startTime = System.currentTimeMillis();
 ArrayDeque<String> toVisit = new ArrayDeque<String>(reachableSet);
 int maxAttempts = maxRetry + toVisit.size();
 int tryCount = 0, errorCount = 0, newCount = 0;
 while (!toVisit.isEmpty() && tryCount<maxAttempts) {
 tryCount++;
 String url = toVisit.removeFirst();
 Document page = null;
 try {
 page = Jsoup.connect(url).get();
 url2doc.put(url, page);
 if (saveIfArticle(url,page) )
 newCount++;
 }
 catch (SocketTimeoutException ex) { // Time-out ==> re-queue and back off
 //System.out.println(tryCount+"\tTIME-OUT: "+ url);
 toVisit.add(url);
 try { Thread.sleep(RetrySleep); // Calm down!
 } catch (InterruptedException e) { e.printStackTrace();} 
 }
 catch (Exception ex) { // Any other fetch problem ==> count and drop
 //System.err.println(tryCount+"\t"+ex.getMessage()+" "+url);
 errorCount++;
 }
 }
 if (tryCount == maxAttempts)
 System.err.println("Failed to download all reachable pages within given limit ("+maxRetry+") of retries!");
 else {
 System.out.print(", New Articles: "+newCount);
 System.out.print(", Errors: "+errorCount+", Required retries: "+(tryCount-reachableSet.size()));
 long ellapsedTime = System.currentTimeMillis() - startTime;
 System.out.println(", Reachable traversal done in "+(ellapsedTime/1000)+" seconds");
 }
 }
 /*
 * Article identification and extraction 
 */
 // Treats any page containing an <article> tag as an article; if so, builds
 // and queues it. Returns true when an article was produced.
 private boolean saveIfArticle(String url, Document page) {
 // article tag ==> an article in DN
 Elements articleElements = page.getElementsByTag("article");
 if (!articleElements.isEmpty()) {
 buildArticle(url,page);
 return true;
 }
 return false;
 }
 /* Url filtering 
 * - Remove unwanted file/image references
 * - Apply DN specific url filtering
 * 
 */
 private static final Pattern FILE_FILTER = Pattern.compile(
 ".*(\\.(css|js|bmp|gif|jpe?g|png|tiff?|mid|mp2|mp3|mp4|wav|avi|mov|mpeg|ram|m4v|pdf" +
 "|rm|smil|wmv|swf|wma|zip|rar|gz))$");
 // A URL is relevant if it is non-empty, not a static asset (see FILE_FILTER),
 // and accepted by the DN-specific rules.
 private boolean isRelevant(String url) {
 if (url.length() < 1) // Remove empty urls
 return false;
 else if (FILE_FILTER.matcher(url).matches()) { // Ignore urls matching our defined set of unwanted image/file extensions.
 //System.out.println("GeneralDrop "+url);
 return false;
 }
 else
 return DNSpecific.isRelevant(url);
 }
 /*
 * Extract text and build Article/Json object,
 * then forward it to the consumer via the blocking queue.
 * NOTE(review): extractContent() receives the page serialized back to an HTML
 * string -- confirm the extractor expects raw HTML rather than a Document.
 */
 private void buildArticle(String url, Document page) {
 long discoveryTime = System.currentTimeMillis();
 String asText = page.toString();
 try {
 ArticleTextExtractor extractor = new ArticleTextExtractor();
 JResult res = extractor.extractContent(asText);
 String title = res.getTitle();
 String text = res.getText();
 //System.out.println(title);
 System.out.println(text);
 Article a = new Article();
 a.setWebsite("http://www.dn.se/");
 a.setUrl(url);
 a.setTitle(title);
 a.setDiscoveryTime(discoveryTime);
 a.setText(text);
 //System.out.println(a);
 // Add to queue ==> forward article to consumer
 try { jsonQueue.put(a); } 
 catch (InterruptedException ex) { ex.printStackTrace();}
 } 
 catch (Exception e) {
 e.printStackTrace();
 }
 //System.exit(-1);
 }
 /*
 * Utility methods
 * 
 * SimpleDateFormat is not thread-safe; safe here only because these formatters
 * are used exclusively by this crawler thread.
 */
 SimpleDateFormat timeFormat = new SimpleDateFormat("HH:mm:ss");
 private String getTime() {
 Date date = new Date();
 String dateTime = timeFormat.format( date);
 return dateTime;
 }
 SimpleDateFormat dayFormat = new SimpleDateFormat("yyyy-MM-dd");
 private String getDay() {
 Date date = new Date();
 String dateTime = dayFormat.format( date);
 return dateTime;
 }
}

And here is the code for the second robot:

public class Main {
 public static void main(String[] args) {
  // Queue connecting the TheLocal crawler (producer) to the JSON consumer.
  BlockingQueue<JsonObject> articleQueue = new LinkedBlockingQueue<JsonObject>();
  FakeJsonConsumer jsonConsumer = new FakeJsonConsumer(articleQueue);
  jsonConsumer.start();
  TLCrawler crawler = new TLCrawler(articleQueue);
  crawler.start();
 }
}
public class TLSpecific {
 // Seed pages on thelocal.se from which every crawl iteration starts.
 public static final String[] tlSeeds = { "http://www.thelocal.se/",
 "http://www.thelocal.se/page/view/national/",
 "http://www.thelocal.se/page/view/money/",
 "http://www.thelocal.se/page/view/politics/",
 "http://www.thelocal.se/page/view/society/",
 "http://www.thelocal.se/page/view/scitech/",
 "http://www.thelocal.se/page/view/education/",
 "http://www.thelocal.se/page/view/sport/",
 "http://www.thelocal.se/page/view/analysis/",
 "http://www.thelocal.se/page/view/features/",
 "http://www.thelocal.se/page/view/businesstravel2015/",
 "http://www.thelocal.se/page/view/study-at-malmo-university/"
 };

 /** Returns a fresh, mutable set containing all TheLocal seed URLs. */
 public static Set<String> getSeedSet() {
  HashSet<String> seedSet = new HashSet<String>();
  for (String seed : tlSeeds)
   seedSet.add(seed);
  return seedSet;
 }

 // Renamed from DN_FILTER (copy-paste leftover from the DN crawler).
 // NOTE(review): the pattern still lists DN-specific sections such as
 // /webb-tv/ and /fondlistor/ -- confirm they are intended for thelocal.se too.
 private static final Pattern TL_FILTER = Pattern.compile(".*((/rss/|/webb-tv/|/fondlistor/|rm=print|/css/|/pages/)).*");

 /**
  * Decides whether a discovered URL is worth fetching: it must be an absolute
  * URL on www.thelocal.se and must not match the non-article filter above.
  */
 public static boolean isRelevant(String url) {
  if (TL_FILTER.matcher(url).matches()) {
   return false;
  }
  else if (url.startsWith("/")) {
   // Relative links are unexpected here (caller resolves abs:href); log and skip.
   System.out.println("TheLocal local web reference? "+url);
   return false;
  }
  return url.startsWith("http://www.thelocal.se/");
 }
}
/**
 * Crawler thread for www.thelocal.se.
 *
 * On each iteration it re-fetches the TheLocal seed pages, collects every
 * relevant outgoing link, downloads pages not seen in earlier iterations, and
 * forwards each page containing an &lt;article&gt; tag to the consumer via jsonQueue.
 *
 * NOTE(review): nearly identical to DNCrawler -- the common logic belongs in a
 * shared (abstract) base class parameterized by site name, root URL and seeds.
 */
public class TLCrawler extends Thread {
 private final int MaxSeedPageRetry = 5; // Extra attempts allowed when fetching seed pages
 private final int MaxPageRetry = 20; // Extra attempts allowed when fetching reachable pages
 private final int IterationSleep = 10; // Time between iterations in minutes
 private final int RetrySleep = 500; // Time between retries in milliseconds
 private final BlockingQueue<JsonObject> jsonQueue; // Our output channel
 // Extracted articles are handed to the consumer through the given queue.
 public TLCrawler(BlockingQueue<JsonObject> queue) {
 super("TLCrawlerThread");
 jsonQueue = queue;
 System.out.println("Enter "+this.getClass().getName());
 }
 // Main loop: one full initial crawl, then an incremental crawl every
 // IterationSleep minutes that only fetches URLs not present in url2doc.
 @Override
 public void run() {
 System.out.println("TLCrawler up and running ... ");
 System.out.println("\n************************************ ");
 System.out.println(getTime()+": Initial search started on "+getDay());
 // Setup TheLocal specific seeds
 Set<String> seedSet = TLSpecific.getSeedSet();
 System.out.println("Seeds: " + seedSet);
 // Traverse seeds to find reachable
 HashMap<String,Document> url2doc = new HashMap<String,Document>();
 Set<String> urlSet = traverseSeeds(seedSet,url2doc,MaxSeedPageRetry);
 System.out.print("Seed Reachable TL Urls: "+ urlSet.size());
 //System.exit(-1);
 traverseReachable(urlSet,url2doc,MaxPageRetry);
 // Repeat forever
 while (true) {
 try { Thread.sleep(IterationSleep*60*1000); // Wait for next iteration
 // NOTE(review): interrupt is swallowed; consider restoring the interrupt
 // flag (Thread.currentThread().interrupt()) and terminating the loop.
 } catch (InterruptedException e) { e.printStackTrace();}
 System.out.println("\n"+getTime()+": New search started");
 urlSet = traverseSeeds(seedSet,url2doc,MaxSeedPageRetry);
 System.out.print("Seed Reachable TL Urls: "+ urlSet.size());
 // Find new reachable urls
 HashSet<String> newReachable = new HashSet<String>();
 for (String url : urlSet) {
 if (url2doc.get(url) == null) { // not downloaded in any earlier iteration
 newReachable.add(url);
 }
 }
 // Fetch new reachable pages
 traverseReachable(newReachable,url2doc,MaxPageRetry);
 }
 }
 // Downloads every seed page, records it in url2doc, and returns the set of
 // relevant absolute URLs linked from the seeds (the seeds themselves excluded).
 // Timed-out seeds are re-queued; at most maxRetry extra attempts in total.
 private Set<String> traverseSeeds(Set<String> seedSet, HashMap<String,Document> url2doc, int maxRetry) {
 ArrayDeque<String> toVisit = new ArrayDeque<String>(seedSet);
 HashSet<String> urlSet = new HashSet<String>();
 int maxAttempts = maxRetry + toVisit.size();
 int tryCount = 0;
 while (!toVisit.isEmpty() && tryCount<maxAttempts) {
 tryCount++;
 String seedUrl = toVisit.removeFirst();
 Document seedPage = null;
 try {
 seedPage = Jsoup.connect(seedUrl).get();
 url2doc.put(seedUrl, seedPage);
 // Find all outgoing links and add relevant links
 Elements links = seedPage.select("a[href]");
 for (Element link : links) {
 String url = link.attr("abs:href").trim().toLowerCase();
 if ( !seedSet.contains(url) && isRelevant(url)) {
 urlSet.add(url);
 }
 }
 }
 catch (SocketTimeoutException ex) { // Time-out ==> add to queue again and sleep for a while
 toVisit.add(seedUrl);
 try { Thread.sleep(RetrySleep); // Calm down!
 } catch (InterruptedException e) { e.printStackTrace();}
 }
 catch (Exception ex) { // HTML fetch problem ==> drop (in this iteration)
 System.err.println("\t"+ex.getMessage()+" "+seedUrl);
 }
 }
 // NOTE(review): this also fires when the final attempt succeeded exactly at
 // the limit, so the message can be a false alarm.
 if (tryCount == maxAttempts)
 System.err.println("Failed to download all seeds within given limit ("+maxRetry+") of retries!");
 return urlSet;
 }
 // Downloads every URL in reachableSet, records pages in url2doc, and queues
 // any page identified as an article. Prints a one-line progress summary.
 private void traverseReachable(Set<String> reachableSet, HashMap<String,Document> url2doc, int maxRetry) {
 System.out.print(", New Reachable Urls: "+ reachableSet.size());
 long startTime = System.currentTimeMillis();
 ArrayDeque<String> toVisit = new ArrayDeque<String>(reachableSet);
 int maxAttempts = maxRetry + toVisit.size();
 int tryCount = 0, errorCount = 0, newCount = 0;
 while (!toVisit.isEmpty() && tryCount<maxAttempts) {
 tryCount++;
 String url = toVisit.removeFirst();
 Document page = null;
 try {
 page = Jsoup.connect(url).get();
 url2doc.put(url, page);
 if (saveIfArticle(url,page) )
 newCount++;
 }
 catch (SocketTimeoutException ex) { // Time-out ==> re-queue and back off
 //System.out.println(tryCount+"\tTIME-OUT: "+ url);
 toVisit.add(url);
 try { Thread.sleep(RetrySleep); // Calm down!
 } catch (InterruptedException e) { e.printStackTrace();}
 }
 catch (Exception ex) { // Any other fetch problem ==> count and drop
 //System.err.println(tryCount+"\t"+ex.getMessage()+" "+url);
 errorCount++;
 }
 }
 if (tryCount == maxAttempts)
 System.err.println("Failed to download all reachable pages within given limit ("+maxRetry+") of retries!");
 else {
 System.out.print(", New Articles: "+newCount);
 System.out.print(", Errors: "+errorCount+", Required retries: "+(tryCount-reachableSet.size()));
 long ellapsedTime = System.currentTimeMillis() - startTime;
 System.out.println(", Reachable traversal done in "+(ellapsedTime/1000)+" seconds");
 }
 }
 // Treats any page containing an <article> tag as an article; if so, builds
 // and queues it. Returns true when an article was produced.
 private boolean saveIfArticle(String url, Document page) {
 // article tag ==> an article page on TheLocal
 Elements articleElements = page.getElementsByTag("article");
 if (!articleElements.isEmpty()) {
 buildArticle(url,page);
 return true;
 }
 return false;
 }
 // Static asset extensions that should never be crawled.
 private static final Pattern FILE_FILTER = Pattern.compile(
 ".*(\\.(css|js|bmp|gif|jpe?g|png|tiff?|mid|mp2|mp3|mp4|wav|avi|mov|mpeg|ram|m4v|pdf" +
 "|rm|smil|wmv|swf|wma|zip|rar|gz))$");
 // A URL is relevant if it is non-empty, not a static asset (see FILE_FILTER),
 // and accepted by the TheLocal-specific rules.
 private boolean isRelevant(String url) {
 if (url.length() < 1) // Remove empty urls
 return false;
 else if (FILE_FILTER.matcher(url).matches()) {
 return false;
 }
 else
 return TLSpecific.isRelevant(url);
 }
 /*
 * Extract text and build Article/Json object,
 * then forward it to the consumer via the blocking queue.
 * NOTE(review): extractContent() receives the page serialized back to an HTML
 * string -- confirm the extractor expects raw HTML rather than a Document.
 */
 private void buildArticle(String url, Document page) {
 long discoveryTime = System.currentTimeMillis();
 String asText = page.toString();
 try {
 ArticleTextExtractor extractor = new ArticleTextExtractor();
 JResult res = extractor.extractContent(asText);
 String title = res.getTitle();
 String text = res.getText();
 //System.out.println(title);
 // System.out.println(text);
 Article a = new Article();
 a.setWebsite("http://www.thelocal.se/");
 a.setUrl(url);
 a.setTitle(title);
 a.setDiscoveryTime(discoveryTime);
 a.setText(text);
 // System.out.println(a.toJsonString());
 //System.out.println(a);
 // Add to queue ==> forward article to consumer
 try { jsonQueue.put(a); }
 catch (InterruptedException ex) { ex.printStackTrace();}
 }
 catch (Exception e) {
 e.printStackTrace();
 }
 //System.exit(-1);
 }
 /*
 * Utility methods.
 * SimpleDateFormat is not thread-safe; safe here only because these formatters
 * are used exclusively by this crawler thread.
 */
 SimpleDateFormat timeFormat = new SimpleDateFormat("HH:mm:ss");
 private String getTime() {
 Date date = new Date();
 String dateTime = timeFormat.format( date);
 return dateTime;
 }
 SimpleDateFormat dayFormat = new SimpleDateFormat("yyyy-MM-dd");
 private String getDay() {
 Date date = new Date();
 String dateTime = dayFormat.format( date);
 return dateTime;
 }
}
asked Mar 7, 2016 at 7:33
\$\endgroup\$
4
  • \$\begingroup\$ Do they only differ in the URLs to scrape, or...? \$\endgroup\$ Commented Mar 7, 2016 at 7:57
  • \$\begingroup\$ Yes, exactly! But the thing is that each website has some hyperlinks to its subcategories (like Sports, Art and so on); that's why I actually put them in different classes and tried to have some good OOD :/ \$\endgroup\$ Commented Mar 7, 2016 at 8:12
  • \$\begingroup\$ Are you on Java 8? \$\endgroup\$ Commented Mar 7, 2016 at 8:14
  • \$\begingroup\$ Yes, I am on Java 8 \$\endgroup\$ Commented Mar 7, 2016 at 8:16

2 Answers 2

3
\$\begingroup\$

Code duplication

From your self-identification, the non-duplicate parts of both classes largely center around the URLs that need to be fetched. This means, if you have just one class:

public class WebSiteCrawler {
 // instance-specific variables?
 public WebSiteCrawler(/* what can go here? */) {
 // ...
 }
}

Those URLs are certainly a good candidate as constructor arguments, i.e. the /* what can go here? */ part.

You also have some System.out.println() statements that display the site name, and a root URL, so a possible implementation might be:

public class WebSiteCrawler {
 private final String siteName;
 private final String rootUrl;
 private final Set<String> seeds;
 public WebCrawler(String siteName, String rootUrl, String... seeds) {
 this.siteName = Objects.requireNonNull(siteName);
 this.rootUrl = Objects.requireNonNull(rootUrl);
 this.seeds = Arrays.stream(Objects.requireNonNull(seeds))
 .map(v -> String.join("/", rootUrl, v, ""))
 .collect(Collectors.toSet());
 }
 // converted to non-static so that rootUrl can be used
 private boolean isRelevant(String url) {
 // ...
 return url.startsWith(rootUrl);
 }
 private void buildArticle(String url, Document page) {
 // ...
 try {
 // ...
 Article a = new Article();
 a.setWebsite(rootUrl);
 // ...
 } catch (Exception e) {
 // tip: try to catch more specific Exceptions
 }
 }
}

And this is how you can create the instance for DN:

WebSiteCrawler dnCrawler = new WebSiteCrawler("DN", "http://www.dn.se", 
 "ekonomi", "sthlm", "sport",
 "kultur", "ledare", "motor");

Asynchronous multi-threaded processing

In addition to the usual way of using an ExecutorService to start tasks asynchronously, Java 8 also gives you the CompletableFuture class that automates and simplifies most of the manual handling one will need to do for both task processing and thread lifecycle management. In fact, there's already a handful of Google search results that it's worth your time paying attention to. :)

Java 8 Time APIs

Instead of the 'legacy' Date and SimpleDateFormat classes for handling date/time formatting, there's the new java.time.* APIs that you should consider using. For example:

// can be made static
private String getTime() {
 return DateTimeFormatter.ISO_LOCAL_TIME.format(LocalTime.now());
}
// can be made static
private String getDay() {
 return DateTimeFormatter.ISO_LOCAL_DATE.format(LocalDate.now());
}
answered Mar 8, 2016 at 0:55
\$\endgroup\$
3
\$\begingroup\$

Some things I encountered while briefly looking over your code – a very, very incomplete review though. ;)

Code Duplication

Duplication is the primary enemy of a well-designed system.

... says Uncle Bob.

You really have a lot of duplication and you should try to put this duplicated code into a common place. This could be a common super class which you abstracted away from your concrete implementations in this case, where you override the specifics in your respective child class. Where no state is involved you could extract code to a static method in a class of its own. It does not make much sense to have two times the same method(s) with the identical implementation(s). Try to parameterize in such cases. For different behavior consider the use of a Template Method for instance.

Constants

Instead of having final instance variables which already get initialized right away, rather use final class variables, i.e. constants (which should be written in uppercase following conventions).

private final int MaxSeedPageRetry = 5; // turns into:
private static final int MAX_SEED_PAGE_RETRY = 5;

Naming Conventions

It is a good practice to follow commonly accepted naming conventions. Hence constants should be all uppercase, as already said.

Encapsulation and Visibility

public static final String[] tlSeeds = { 
 // ...
};

As far as I can see this is not used outside of its class. If this is the case it should be declared private. It is good practice to keep the visibility as restrictive as possible and only change that in case it is really necessary.

"Refer to objects by their interfaces"

Instead of:

HashSet<String> newReachable = new HashSet<String>();

rather use:

Set<String> newReachable = new HashSet<String>();

Also see: Joshua Bloch – Effective Java, Item 52.

answered Mar 7, 2016 at 21:31
\$\endgroup\$

Your Answer

Draft saved
Draft discarded

Sign up or log in

Sign up using Google
Sign up using Email and Password

Post as a guest

Required, but never shown

Post as a guest

Required, but never shown

By clicking "Post Your Answer", you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.