Web-shopping is popular and widely used nowadays; however, comparing prices across various platforms is tedious and inefficient for customers. A customer might be interested in a particular brand but have no concrete idea of which model he/she is going to buy. In this work, we suggest to users which platform has the cheapest laptops of a certain brand, to help them make a choice. Because of the large number of laptop models on the market, the comparison becomes a time-consuming job. Therefore, we are interested in the performance improvement that Hadoop can bring if we partition the workload and process it in parallel.
To compare prices between two web-shopping platforms, we must get the price of a certain model from one platform and then search for the same model on the other one. This operation is similar to performing an equijoin on the model across the two websites.
However, with the help of the robust search functions of these websites, we can use them as an index to facilitate our equijoin and thereby minimize the time and resources our service needs.
The flow of our program is stated as follows:
- Retrieve all the URLs of a particular brand on Amazon.com and store them in a file.
- Extract the price of every model from the stored URLs and eliminate unwanted models (non-laptop items).
- Compare the price of a model across three web-shopping platforms (Amazon.com, Newegg and Buy.com) and determine the cheapest one. (Figure 1)
- Utilize MapReduce to simplify the join operation and run the program in parallel.
There are some pros and cons in our strategy:
Pros:
- Utilize the built-in search engine of each platform as an index to speed up the “join”.
- Take advantage of Amazon’s well-organized categories to group the models of a certain brand together.
- Provide users with the newest product information from three well-known web-shopping platforms.
Cons:
- Due to network instability, the performance might be affected.
- Compared to implementing the system without Hadoop, some intranet communication overhead exists in the system.
- The performance and reliability would be affected if any platform is congested or crashes.
import java.io.IOException; import java.util.*; import org.apache.hadoop.fs.Path; import org.apache.hadoop.conf.*; import org.apache.hadoop.io.*; import org.apache.hadoop.mapred.*; import org.apache.hadoop.util.*; import java.io.*; import java.net.HttpURLConnection; import java.net.URL; public class ComparePrice { static int iCurrentPos; static int iCounting = 0; static Vector<String> vTempURL = new Vector<String>(); static Vector<String> vBrandlist = new Vector<String>(); static Vector<String> vBrandURL = new Vector<String>(); static Vector<String> vItemInfo = new Vector<String>(); static Vector<String> vOutputAllString = new Vector<String>(); static int Am; static int Buycom; static int Newegg; static int iFileCount; static String TimeString; static int iDownloadingTime; static String CurrentBrand; //--------------------------------------------------------------------------- public static String getHTML(String urlPath, String fmt) { StringBuffer total = new StringBuffer(); try { URL url = new URL(urlPath); HttpURLConnection connection = (HttpURLConnection) url.openConnection(); connection.connect(); InputStream inStream = connection.getInputStream(); BufferedReader reader = new BufferedReader(new InputStreamReader(inStream,fmt)); String line=""; while ((line = reader.readLine()) !=null ) { total.append(line + "\n"); } } catch(IOException e) { e.printStackTrace(); System.out.println("ERROR: GET HTML FILE FAILS"); } int brand,brand1; brand = urlPath.indexOf("http://www.amazon.com/"); brand1 = urlPath.indexOf("-"); String brandString = urlPath.substring(brand+22, brand1); int model,model1; model = total.toString().indexOf("model number:"); model1 = total.toString().indexOf("<",model+18); String modelString = total.toString().substring(model+18, model1); int price,price1; price = total.toString().indexOf("priceLarge"); System.out.println(total.toString()); System.out.println(price); price1 = total.toString().indexOf("<",price+13); System.out.println(price1); String 
priceString = total.toString().substring(price+13, price1); if(priceString.contains(",")) { String[] temp = priceString.split(","); StringBuffer strBf = new StringBuffer(); for(int i=0;i<=1;i++) { strBf.append(temp[i]); } priceString = strBf.toString(); } StringBuffer output = new StringBuffer(); output.append(brandString); output.append(modelString); output.append(priceString); System.out.println(output); try { BufferedWriter out = new BufferedWriter(new FileWriter("fff.txt")); out.write(output.toString()); out.newLine(); out.close(); } catch (IOException e) { System.out.println("Exception "); } return output.toString(); } //--------------------------------------------------------------------------- public static String getURL(String HTML_Body,String strBrind, int iPos) { String brandURL=""; iPos = HTML_Body.indexOf("http://www.amazon.com/",iPos); if(HTML_Body.indexOf("\"", iPos)>0) { brandURL = HTML_Body.substring( iPos , HTML_Body.indexOf("\"", iPos) ); iCurrentPos = HTML_Body.indexOf("\"", iPos) ; vTempURL.add(brandURL); if( HTML_Body.indexOf("http://www.amazon.com/",iCurrentPos) != -1 ) getURL(HTML_Body,strBrind,iCurrentPos); } return brandURL; } //--------------------------------------------------------------------------- public static String getPageN_URL(String HTML_Body,String strBrind, int iPos) { String brandURL=""; iPos = HTML_Body.indexOf("http://www.amazon.com/",iPos); if(HTML_Body.indexOf("\"", iPos)>0) { brandURL = HTML_Body.substring( iPos , HTML_Body.indexOf("\"", iPos) ); iCurrentPos = HTML_Body.indexOf("\"", iPos) ; vTempURL.add(brandURL); if( HTML_Body.indexOf("http://www.amazon.com/",iCurrentPos) != -1 ) getPageN_URL(HTML_Body,strBrind,iCurrentPos); } return brandURL; } //--------------------------------------------------------------------------- public static String getHTMLBody(String urlPath, String fmt) { StringBuffer total = new StringBuffer(); try { URL url = new URL(urlPath); long StartTime = System.currentTimeMillis(); HttpURLConnection 
connection = (HttpURLConnection) url.openConnection(); connection.connect(); InputStream inStream = connection.getInputStream(); BufferedReader reader = new BufferedReader(new InputStreamReader(inStream,fmt)); long ProcessTime = System.currentTimeMillis() - StartTime; iDownloadingTime += ProcessTime; String line=""; while ((line = reader.readLine()) !=null ) total.append(line + "\n"); } catch(IOException e) { total.append(""); e.printStackTrace(); System.out.println("Get HTML Error! " + urlPath); vOutputAllString.add("Get HTML Error! " + urlPath); } return total.toString(); } //--------------------------------------------------------------------------- public static void SaveToLocalFile(String sFilename, String sContext) { BufferedWriter outData; try { outData = new BufferedWriter(new FileWriter(sFilename)); outData.write(sContext); outData.close(); } catch (IOException e) { System.out.println("Exception: Save to local disk fails!"); vOutputAllString.add("Exception: Save to local disk fails!"); e.printStackTrace(); } } //--------------------------------------------------------------------------- public static String getAmazonPopPriceHTMLBody(String urlPath) { String strPirce = ""; String strFindStr = "class=\"priceLarge\""; String strHTMLbody=getHTMLBodyIE(urlPath,"utf8").toLowerCase(); SaveToLocalFile("output\\"+TimeString+".AmazonPop."+String.valueOf(iFileCount)+".txt",strHTMLbody); iFileCount++; int iPrePricePos = strHTMLbody.toString().indexOf(strFindStr.toLowerCase()) + strFindStr.length(); int iPostPricePos = 0; if(iPrePricePos != -1) { iPostPricePos = strHTMLbody.toString().indexOf( "<" ,iPrePricePos); strPirce = strHTMLbody.substring(iPrePricePos+2, iPostPricePos).trim(); } else strPirce = ""; return strPirce; } //--------------------------------------------------------------------------- public static String getHTMLBodyIE(String urlPath, String fmt) { StringBuffer total = new StringBuffer(); try { URL url = new URL(urlPath); long StartTime = 
System.currentTimeMillis(); HttpURLConnection connection = (HttpURLConnection) url.openConnection(); connection.setDoInput( true ); connection.setRequestMethod( "GET" ); connection.setRequestProperty( "Host", url.getHost() ); connection.setRequestProperty( "User-Agent", "Mozilla/5.0 (Windows; U; Windows NT 5.1) MSIE 7.0 IE/6.0 Gecko/20100316 Firefox/3.6.2 AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1" ); connection.setRequestProperty( "Accept", "*/*" ); connection.connect(); if(connection.getResponseCode()==200); { InputStream inStream= connection.getInputStream(); BufferedReader reader = new BufferedReader(new InputStreamReader(inStream,fmt)); long ProcessTime = System.currentTimeMillis() - StartTime; iDownloadingTime += ProcessTime; String line=""; while ((line = reader.readLine()) !=null ) total.append(line + "\n"); } } catch(IOException e) { total.append(""); System.out.println("Get HTML Error! " + urlPath); vOutputAllString.add("Get HTML Error! " + urlPath); } return total.toString(); } //--------------------------------------------------------------------------- public static String GetNeweggInfo(String strURL) { String strSearchBody=getHTMLBodyIE(strURL,"utf8").toLowerCase(); SaveToLocalFile("output\\"+TimeString+".Newegg."+String.valueOf(iFileCount)+".txt",strSearchBody); iFileCount++; String priceString=""; String strFindStr = "\"finalprice\":"; int price = strSearchBody.toString().indexOf(strFindStr.toLowerCase()); int price1=0; if(price!=-1) { price1 = strSearchBody.indexOf(",",price + strFindStr.length()); priceString = strSearchBody.substring(price + strFindStr.length()+1, price1).trim(); } else { priceString = ""; } if(priceString.contains("\"")) priceString = priceString.replace("\"" , ""); return priceString; } //--------------------------------------------------------------------------- public static String CheckOtherPlatform(String sModel, Float fAmazonPrice) { System.out.println("Amazon: $" + fAmazonPrice); 
vOutputAllString.add("Amazon: $" + fAmazonPrice); Float fBuycom=0f; Float fNewegg=0f; int compareTimes=0; //Buy.com String BuycomURL = "http://mobile.buy.com/ibuy/Search.aspx?pg=0&s=" + sModel; String strSearchBody=getHTMLBodyIE(BuycomURL,"utf8").toLowerCase(); SaveToLocalFile("output\\"+TimeString+".Platform."+String.valueOf(iFileCount)+".txt",strSearchBody); iFileCount++; if(strSearchBody!="") { if(strSearchBody.indexOf("sorry")==-1) { String strFindURL = "<li class=\"prodlist\">"; if(strSearchBody.indexOf(strFindURL) != -1) { String strGetItemURL = strSearchBody.substring( strSearchBody.indexOf("<a href=\"", strSearchBody.indexOf(strFindURL))+9, strSearchBody.indexOf("\" rel=", strSearchBody.indexOf(strFindURL))); System.out.println(strGetItemURL); vOutputAllString.add(strGetItemURL); String BuyPriceString = ""; String strFindStr = "<div class='ProductPrice'>"; int price = strSearchBody.toString().indexOf(strFindStr.toLowerCase()); int price1=0; if(price!=-1) { price1 = strSearchBody.indexOf("</div>",price + strFindStr.length()); BuyPriceString = strSearchBody.substring(price + strFindStr.length()+1, price1).trim(); if(BuyPriceString.indexOf("</span>")!=-1) BuyPriceString = BuyPriceString.substring(BuyPriceString.indexOf("</span>")+9,BuyPriceString.length()); } else { BuyPriceString = ""; } if(BuyPriceString.contains(",")) BuyPriceString = BuyPriceString.replace("," , ""); if(BuyPriceString!="") { if(BuyPriceString.indexOf("(") != -1) BuyPriceString = BuyPriceString.substring(1,BuyPriceString.indexOf("(")-1); if(BuyPriceString.indexOf("price")==-1) fBuycom=Float.parseFloat(BuyPriceString.trim()); } if(BuyPriceString!="") compareTimes++; System.out.println("Buy.com: $" + BuyPriceString); vOutputAllString.add("Buy.com: $" + BuyPriceString); } } else { fBuycom = 0f; } } //End of Buy.com: //Newegg /* * You must get the product item number first. * Then, open ItemInfo4ProductDetail.js by the item number. 
* Finally, get the price */ String NeweggURL = "http://www.newegg.com/Product/ProductList.aspx?Submit=ENE&DEPA=0&Description=" + sModel; String strGetItemInfoFile=getHTMLBodyIE(NeweggURL,"utf8").toLowerCase(); SaveToLocalFile("output\\"+TimeString+".Platform."+String.valueOf(iFileCount)+".txt",strSearchBody); iFileCount++; String NeweggPriceString = ""; if(strGetItemInfoFile !="") { System.out.println(NeweggURL); vOutputAllString.add(NeweggURL); if(strGetItemInfoFile.indexOf("search results") != -1) { String strFindModel="model #: </b>"+sModel.toLowerCase(); if(strGetItemInfoFile.indexOf(strFindModel)!=-1) { String strSearchModel="<li><b>Model #: </b>" + sModel; strSearchModel = strSearchModel.toLowerCase(); int iModelPos = strGetItemInfoFile.indexOf(strSearchModel); int iGetPos = strGetItemInfoFile.indexOf("item #: </b>", iModelPos); String strItemNum = strGetItemInfoFile.substring(iGetPos + 12,strGetItemInfoFile.indexOf("</li>",iGetPos)); String ItemURL = "http://content.newegg.com/LandingPage/ItemInfo4ProductDetail.aspx?Item="+strItemNum; NeweggPriceString = GetNeweggInfo(ItemURL); } else NeweggPriceString = ""; } else { String tmp = "http://www.newegg.com/Product/Product.aspx?"; tmp = tmp.toLowerCase(); String strItemNum = strGetItemInfoFile.substring( strGetItemInfoFile.indexOf(tmp)+tmp.length()+5, strGetItemInfoFile.indexOf("\"",strGetItemInfoFile.indexOf(tmp))); String ItemURL = "http://content.newegg.com/LandingPage/ItemInfo4ProductDetail.aspx?Item="+strItemNum; NeweggPriceString = GetNeweggInfo(ItemURL); } } if(NeweggPriceString.contains(",")) NeweggPriceString = NeweggPriceString.replace("," , ""); if(NeweggPriceString!="" && NeweggPriceString.indexOf("ull") == -1) { fNewegg=Float.parseFloat(NeweggPriceString.trim()); } else NeweggPriceString = ""; if(NeweggPriceString!="") compareTimes++; System.out.println("Newegg: $" + NeweggPriceString); vOutputAllString.add("Newegg: $" + NeweggPriceString); //End of Newegg.com Float fMin = fAmazonPrice; String strMin 
= "Amazon"; if(fBuycom>300 && fBuycom<fMin) { fMin = fBuycom; strMin = "Buycom"; } if(fNewegg>300 && fNewegg<fMin) { fMin = fNewegg; strMin = "Newegg"; } if(compareTimes>0) { if(strMin == "Amazon") Am++; if(strMin == "Buycom") Buycom++; if(strMin == "Newegg") Newegg++; System.out.println("************The Lowest: $" + strMin + " : "+String.valueOf(fMin)+"************"); vOutputAllString.add("************The Lowest: $" + strMin + " : "+String.valueOf(fMin)+"************"); } String downloadTime = "Total Download Time: " + String.valueOf(iDownloadingTime); System.out.println(downloadTime); return strMin; } //--------------------------------------------------------------------------- public static String getItemInfo(String urlPath) { iCounting = iCounting + 1; System.out.println(String.valueOf(iCounting) + ". " +urlPath); int brand,brand1; String strFindString = "http://www.amazon.com/"; brand = urlPath.indexOf(strFindString); brand1 = urlPath.indexOf("-"); String brandString; if(brand!=-1) brandString = urlPath.substring(brand + strFindString.length() , brand1); else brandString = ""; int model,model1; String strHTMLbody=getHTMLBody(urlPath,"utf8").toLowerCase(); model = strHTMLbody.toString().indexOf("model number:"); model1 = strHTMLbody.toString().indexOf("<",model+18); String modelString = ""; if(model != -1) modelString = strHTMLbody.toString().substring(model+18, model1); if(model == -1) modelString = ""; int price,price1; String priceString = ""; String strFindStr = "class=\"priceLarge\""; price = strHTMLbody.toString().indexOf(strFindStr.toLowerCase()); if(price!=-1) { price1 = strHTMLbody.indexOf("<",price + strFindStr.length()); priceString = strHTMLbody.substring(price + strFindStr.length()+2, price1).trim(); } else { strFindStr = "class=\"mbcPriceCell\""; price = strHTMLbody.indexOf(strFindStr.toLowerCase()); price1 = strHTMLbody.indexOf("<",price + strFindStr.length() ); if(price !=-1 && (price1 > price+price+strFindStr.length()+2) ) priceString = 
strHTMLbody.toString().substring(price+strFindStr.length()+2, price1).trim(); else if(strHTMLbody.indexOf("why don't we show the price") != -1) { String tmp = "http://www.amazon.com/gp/product/du/map-popover-update.html?a="+ urlPath.substring(urlPath.lastIndexOf("/")+1,urlPath.length()) ; priceString = getAmazonPopPriceHTMLBody(tmp); } else priceString = ""; } if(priceString.contains(",")) priceString = priceString.replace("," , ""); StringBuffer output = new StringBuffer(); //Formating the string output.append("#"+urlPath+"@"+brandString.toUpperCase()+"@"+modelString.toUpperCase()+"@"+priceString+"#"); vItemInfo.add(output.toString()); modelString = modelString.toLowerCase(); try { int k = Integer.valueOf(modelString); modelString=""; } catch(Exception e) {;} if( modelString!="" && priceString!="") { System.out.println(String.valueOf(iCounting) + ". \n" +urlPath); vOutputAllString.add(String.valueOf(iCounting) + ". \n" +urlPath); Float fAmazonPrice= Float.parseFloat(priceString.trim()); if(modelString.indexOf(" ")!=-1) modelString = modelString.replace(" ", "%20"); CheckOtherPlatform(modelString,fAmazonPrice); float priceFloat = Float.parseFloat(priceString); return CheckOtherPlatform(modelString,priceFloat); } else { return "NoPrice"; } } //--------------------------------------------------------------------------- public static boolean notEmpty(String s) { return (s != null && s.length() > 0); } //--------------------------------------------------------------------------- public static class Map extends MapReduceBase implements Mapper<LongWritable, Text, Text, IntWritable> { private Text word = new Text(); private final IntWritable one = new IntWritable(1); public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException { String line = value.toString(); StringTokenizer tokenizer = new StringTokenizer(line); while (tokenizer.hasMoreTokens()) { word.set(CurrentBrand + 
getItemInfo(tokenizer.nextToken())); output.collect(word, one); } } } //--------------------------------------------------------------------------- public static class Reduce extends MapReduceBase implements Reducer<Text, IntWritable, Text, IntWritable> { private Text word2 = new Text(); public void reduce(Text key, Iterator<IntWritable> values, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException { int sum = 0; int AmazonNum = 0; while(values.hasNext()) { sum += values.next().get(); } output.collect(key, new IntWritable(sum)); } } //--------------------------------------------------------------------------- public static void main(String[] args) throws Exception { long TotalExecutionTime=0; Am = 0; long StartTime = System.currentTimeMillis(); Buycom = 0; Newegg = 0; CurrentBrand=""; vTempURL.removeAllElements(); vItemInfo.removeAllElements(); JobConf conf = new JobConf(ComparePrice.class); conf.setJobName("ComparePrice"); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(IntWritable.class); conf.setMapperClass(Map.class); conf.setCombinerClass(Reduce.class); conf.setReducerClass(Reduce.class); conf.setInputFormat(TextInputFormat.class); conf.setOutputFormat(TextOutputFormat.class); FileInputFormat.setInputPaths(conf, new Path(args[0])); FileOutputFormat.setOutputPath(conf, new Path(args[1])); CurrentBrand = new String(args[1]); JobClient.runJob(conf); long ProcessTime = System.currentTimeMillis() - StartTime; TotalExecutionTime += ProcessTime; System.out.println(TotalExecutionTime); } }