program Amazon_Books;

// Script constants, the per-book record layout, shared counters/arrays, and
// the helper routines that fill ArrayBooks from category result pages.

const
  // total number of categories (original note: "set to 10")
  cMaxCategory = 5;
  // how many books to extract per category (original note: "set to 500")
  cMaxBooksPerCat = 500;
  // maximum number of books to extract; also the upper bound of ArrayBooks
  cMaxBooks = 5000;
  cOutputFileBooks = 'C:\AmazonBooks.csv';
  cReviewInputFile = 'C:\AmazonTitleURL.csv';

type
  // One extracted book. Every field is kept as text.
  BookRecord = record
    sDetailURL : string;
    sTitle : string;
    sListPrice : string;
    sPrice : string;
    sCustomer1 : string;
    sCustomer2 : string;
    sCustomer3 : string;
    sCustomer4 : string;
    sCustomer5 : string;
    sPublishDate : string;
    sCustomerReview : string;
    sSalesRank : string;
  end;

var
  nCategoryCount, nBookCount, nPage, nIndex, nBooksInCategory : integer;
  // InitCategoryURL assigns indices 1..10, so the bound stays 10 even though
  // cMaxCategory is currently smaller.
  ArrayCategoryURL: array[1..10] of string;
  ArrayBooks: array[1..cMaxBooks] of BookRecord;

// Returns str prepared for use as a single CSV field.
//   - Embedded double quotes are doubled.
//   - The value is wrapped in double quotes when it contains a double quote,
//     a comma, or a line break (#13 or #10).
//   - Any other value is returned unchanged.
function ProcessDblQuote(str: string): string;
var
  bFlag : boolean;
begin
  result := str;
  bFlag := False;
  if (Pos('"', str) > 0) then
  begin
    // NOTE(review): this five-argument StringReplace is not the Delphi
    // SysUtils signature; presumably the host script engine's variant with
    // replace-all / ignore-case flags - confirm.
    str := StringReplace(str, '"', '""', true, true);
    bFlag := True;
  end;
  // A bare line feed splits a CSV row just like a carriage return does, so
  // both force quoting.
  if (Pos(#13, str) > 0) or (Pos(#10, str) > 0) or (Pos(',', str) > 0) then
    bFlag := True;
  if (bFlag) then
    result := '"' + str + '"';
end;

// Assigns the URL of the first search-result page of each of the 10
// categories, so that the script can go directly to it.
procedure InitCategoryURL;
begin
  // 1) Arts & Photography->General
  ArrayCategoryURL[1] := 'http://www.amazon.com/General-Art-Arts-Photography-Books/b/ref=amb_link_5/104-6764641-2666339?%5Fencoding=UTF8&node=779552';
  // 2) Business & Investing->General
  ArrayCategoryURL[2] := 'http://www.amazon.com/General-Business-Investing-Books/b/ref=amb_link_9/104-6764641-2666339?%5Fencoding=UTF8&node=2612';
  // 3) Children's Books->Educational
  ArrayCategoryURL[3] := 'http://www.amazon.com/Educational-Childrens-Books/b/ref=amb_link_9/104-6764641-2666339?%5Fencoding=UTF8&node=69800';
  // 4) Literature & Fiction->General
  ArrayCategoryURL[4] := 'http://www.amazon.com/General-Literature-Fiction-Books/b/ref=amb_link_9/104-6764641-2666339?%5Fencoding=UTF8&node=10125';
  // 5) Mystery & Thrillers->General
  ArrayCategoryURL[5] := 'http://www.amazon.com/General-Mystery-Thrillers-Books/b/ref=amb_link_3/104-6764641-2666339?%5Fencoding=UTF8&node=605116';
  // 6) Health, Mind, & Body->General
  ArrayCategoryURL[6] := 'http://www.amazon.com/General-Health-Mind-Body-Books/b/ref=amb_link_11/104-6764641-2666339?%5Fencoding=UTF8&node=298657';
  // 7) Nonfiction->Philosophy
  ArrayCategoryURL[7] := 'http://www.amazon.com/Philosophy-Nonfiction-Books/b/ref=amb_link_11/104-6764641-2666339?%5Fencoding=UTF8&node=11019';
  // 8) Outdoors & Nature->Travel
  ArrayCategoryURL[8] := 'http://www.amazon.com/Travel-Outdoors-Nature-Books/b/ref=amb_link_15/104-6764641-2666339?%5Fencoding=UTF8&node=290127';
  // 9) Reference->General
  ArrayCategoryURL[9] := 'http://www.amazon.com/General-Reference-Books/b/ref=amb_link_15/104-6764641-2666339?%5Fencoding=UTF8&node=408268';
  // 10) Biography & Memoirs->People, A-Z
  ArrayCategoryURL[10] := 'http://www.amazon.com/People-Biographies-Memoirs-Books/b/ref=amb_link_10/104-6764641-2666339?%5Fencoding=UTF8&node=916928';
end;

// Scans str for repeated occurrences of sMarker and, for each one, appends a
// title / detail-URL pair to ArrayBooks, advancing nBookCount and
// nBooksInCategory.
//   str - text to scan; passed by value, so the caller's copy is untouched.
// Stops early once cMaxBooksPerCat books were taken for the current category
// or ArrayBooks is full.
procedure GetTitleAndURL(str : string);
var
  nPos : integer;
  sTitle, sUrl : string;
  sMarker, sUrlStart, sUrlEnd, sTitleStart, sTitleEnd : string;
begin
  // NOTE(review): every delimiter is an empty string in this copy of the
  // script, so Pos(sMarker, str) yields 0 and the loop body never runs.
  // Presumably the original literals held page markup that was lost -
  // restore them before relying on this routine.
  sMarker := '';
  sUrlStart := '';
  // Was never assigned although it is passed to ExtractSubStr below.
  sUrlEnd := '';
  sTitleStart := '';
  sTitleEnd := '';

  nPos := Pos(sMarker, str);
  while (nPos > 0) do
  begin
    // extract only maximum desired; checked before any extraction work
    if (nBooksInCategory >= cMaxBooksPerCat) then Exit;
    // never write past the end of ArrayBooks
    if (nBookCount >= cMaxBooks) then Exit;

    // Drop everything up to and including the first character of the marker
    // so the next Pos() finds the following occurrence.
    Delete(str, 1, nPos);
    sTitle := ProcessDblQuote(ExtractSubStr(str, sTitleStart, sTitleEnd));
    sUrl := 'http://www.amazon.com/' + ExtractSubStr(str, sUrlStart, sUrlEnd);

    nBookCount := nBookCount + 1;
    nBooksInCategory := nBooksInCategory + 1;
    ArrayBooks[nBookCount].sTitle := sTitle;
    ArrayBooks[nBookCount].sDetailURL := sUrl;

    nPos := Pos(sMarker, str);
  end;
end;

procedure GetSimilarBooks(str: string);
var
  nPos : integer;
  sTitle, sTmp, sStart, sEnd : string;
begin
  sTmp := ExtractSubStr(str, '