[R] extracting a table from pdf file....

Jeff Newmiller jdnewm|| @end|ng |rom dcn@d@v|@@c@@u@
Sun Jan 8 11:29:19 CET 2023


Here is a start for two of them... PDF files are essentially programs written in a page-description language derived from PostScript... there are a lot of ways to write a program to put marks on a page, so this mess is actually not as bad a result as you might have encountered.

# Inspect what extract_tables() returned: a list holding one data frame
# per page of the PDF.
str(IDTpdf)

# Column names to use for every page.
col_names <- c("SNo", "Scrip", "Symbol", "Leverage")

# Page 1: the two-line table header was read in as data rows, which is why
# every column came back as character. Drop those two rows, then restore
# the numeric types.
DF <- IDTpdf[[1]][-(1:2), ]
names(DF) <- col_names
DF$SNo <- as.integer(DF$SNo)
DF$Leverage <- as.numeric(DF$Leverage)
# Dropping rows leaves row names 3..55 behind; reset them so the combined
# data frame below is numbered from 1 without clashing labels.
rownames(DF) <- NULL
IDTpdf[[1]] <- DF

# Page 2: this page has no header, so its first data row (S.No. 54) was
# consumed as the column names (X54, DIVI.S.LABORATORIES.LTD, DIVISLAB,
# X4.5). Rename the columns and put that row back by hand.
DF <- IDTpdf[[2]]
names(DF) <- col_names
DF <- rbind(
  data.frame(
    SNo = 54L, # integer, to match the integer SNo column on this page
    Scrip = "DIVIS LABORATORIES LTD",
    Symbol = "DIVISLAB",
    Leverage = 4.5
  ),
  DF
)
IDTpdf[[2]] <- DF

# Stack the two cleaned pages into a single data frame. rbind() matches
# data-frame columns by name.
# NOTE(review): pages 3 and 4 show the same symptom in the dput() output
# (first row swallowed into the column names) and presumably need the same
# treatment as page 2 before all four can be combined.
do.call(rbind, IDTpdf[1:2])

On January 8, 2023 12:59:58 AM PST, akshay kulkarni <akshay_e4 using hotmail.com> wrote:
>dear members,
>                            I am extracting a pdf table into a data frame from this URL:
>
>https://www.canmoney.in/pdf/INTRADAYLEVERAGE-20220531-latest.pdf
>
>I am using extract_tables() from the tabulizer package (it is archived, so I have installed it from GitHub)
>
>IDTpdf <- extract_tables("https://www.canmoney.in/pdf/INTRADAYLEVERAGE-20220531-latest.pdf",output="data.frame")
>
>But IDTpdf consists of four different dfs, and I want to collapse them into one. The dput of IDTpdf:
>
>list(structure(list(SCRIPS.AVAILABLE.FOR.INTRADAY.WITH.LEVERAGES.PROVIDED.ON.THEM = c("S.No.",
>"times)", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10",
>"11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21",
>"22", "23", "24", "25", "26", "27", "28", "29", "30", "31", "32",
>"33", "34", "35", "36", "37", "38", "39", "40", "41", "42", "43",
>"44", "45", "46", "47", "48", "49", "50", "51", "52", "53"),
>    X = c("Scrip Name", "", "ALKEM LABORATORIES LTD.", "ATUL LTD",
>    "ABB INDIA LIMITED", "AARTI INDUSTRIES LTD", "ABBOTT INDIA LIMITED",
>    "ADITYA BIRLA CAPITAL LTD.", "ADITYA BIRLA FASHION & RT",
>    "ACC LIMITED", "ADANI ENTERPRISES LIMITED", "ADANI PORT & SEZ LTD",
>    "AMARA RAJA BATTERIES LTD.", "ASTRAL LIMITED", "AMBUJA CEMENTS LTD",
>    "ALEMBIC PHARMA LTD", "APOLLO HOSPITALS ENTER. L", "APOLLO TYRES LTD",
>    "ASHOK LEYLAND LTD", "ASIAN PAINTS LIMITED", "AU SMALL FINANCE BANK LTD",
>    "AUROBINDO PHARMA LTD", "AXIS BANK LIMITED", "BAJAJ AUTO LIMITED",
>    "BAJAJ FINSERV LTD.", "BAJAJ FINANCE LIMITED", "BALRAMPUR CHINI MILLS LTD",
>    "BANDHAN BANK LIMITED", "BANK OF BARODA", "BATA INDIA LTD",
>    "BHARAT ELECTRONICS LTD", "BERGER PAINTS (I) LTD", "BHARTI AIRTEL LIMITED",
>    "BHEL", "BIOCON LIMITED.", "BOSCH LIMITED", "BRITANNIA INDUSTRIES LTD",
>    "BIRLASOFT LIMITED", "ZYDUS LIFESCIENCES LTD", "CANARA BANK",
>    "CAN FIN HOMES LTD", "CHAMBAL FERTILIZERS LTD", "CHOLAMANDALAM IN & FIN CO",
>    "CIPLA LTD", "COAL INDIA LTD", "COFORGE LIMITED", "COLGATE PALMOLIVE LTD.",
>    "CONTAINER CORP OF IND LTD", "COROMANDEL INTERNTL. LTD",
>    "CROMPT GREA CON ELEC LTD", "CITY UNION BANK LTD", "CUMMINS INDIA LTD",
>    "DABUR INDIA LTD", "DEEPAK NITRITE LTD", "DELTA CORP LIMITED"
>    ), X.1 = c("Symbol Series", "", "ALKEM", "ATUL", "ABB", "AARTIIND",
>    "ABBOTINDIA", "ABCAPITAL", "ABFRL", "ACC", "ADANIENT", "ADANIPORTS",
>    "AMARAJABAT", "ASTRAL", "AMBUJACEM", "APLLTD", "APOLLOHOSP",
>    "APOLLOTYRE", "ASHOKLEY", "ASIANPAINT", "AUBANK", "AUROPHARMA",
>    "AXISBANK", "BAJAJ-AUTO", "BAJAJFINSV", "BAJFINANCE", "BALRAMCHIN",
>    "BANDHANBNK", "BANKBARODA", "BATAINDIA", "BEL", "BERGEPAINT",
>    "BHARTIARTL", "BHEL", "BIOCON", "BOSCHLTD", "BRITANNIA",
>    "BSOFT", "ZYDUSLIFE", "CANBK", "CANFINHOME", "CHAMBLFERT",
>    "CHOLAFIN", "CIPLA", "COALINDIA", "COFORGE", "COLPAL", "CONCOR",
>    "COROMANDEL", "CROMPTON", "CUB", "CUMMINSIND", "DABUR", "DEEPAKNTR",
>    "DELTACORP"), X.2 = c("Leverage (in", "", "4.5", "4.5", "4.5",
>    "4.5", "4.5", "4.5", "4.5", "4.5", "4", "4.5", "4.5", "4.5",
>    "4.5", "4.5", "4.5", "4.5", "4.5", "4.5", "4.5", "4.5", "4.5",
>    "4.5", "4.5", "4.5", "4", "4", "4.5", "4.5", "4.5", "4.5",
>    "4.5", "4", "4.5", "4.5", "4.5", "4", "4.5", "4", "4.5",
>    "4", "4", "4.5", "4.5", "4", "4.5", "4.5", "4.5", "4.5",
>    "4.5", "4.5", "4.5", "4.5", "4")), class = "data.frame", row.names = c(NA,
>-55L)), structure(list(X54 = 55:110, DIVI.S.LABORATORIES.LTD = c("DIXON TECHNO (INDIA) LTD",
>"DLF LIMITED", "DR. REDDY S LABORATORIES", "ESCORTS INDIA LTD",
>"EXIDE INDUSTRIES LTD", "FEDERAL BANK LTD", "FIRSTSOURCE SOLU. LTD.",
>"GAIL (INDIA) LTD", "GLENMARK PHARMACEUTICALS", "GMR INFRASTRUCTURE LTD.",
>"GUJ NAR VAL FER & CHEM L", "DALMIA BHARAT LIMITED", "GODREJ CONSUMER PRODUCTS",
>"GRANULES INDIA LIMITED", "GRASIM INDUSTRIES LTD", "GUJARAT STATE PETRO LTD",
>"GUJARAT GAS LIMITED", "HINDUSTAN AERONAUTICS LTD", "HAVELLS INDIA LIMITED",
>"HCL TECHNOLOGIES LTD", "HDFC LTD", "HDFC AMC LIMITED", "HDFC BANK LTD",
>"HDFC LIFE INS CO LTD", "HERO MOTOCORP LIMITED", "HINDALCO INDUSTRIES LTD",
>"HINDUSTAN COPPER LTD", "HONEYWELL AUTOMATION IND", "ICICI BANK LTD.",
>"ICICI LOMBARD GIC LIMITED", "ICICI PRU LIFE INS CO LTD", "IDFC LIMITED",
>"IDFC FIRST BANK LIMITED", "INDIAN ENERGY EXC LTD", "INDRAPRASTHA GAS LTD",
>"THE INDIAN HOTELS CO. LTD", "THE INDIA CEMENTS LIMITED", "INDIAMART INTERMESH LTD",
>"INTERGLOBE AVIATION LTD", "INDUSIND BANK LIMITED", "INDUS TOWERS LIMITED",
>"INFOSYS LIMITED", "INTELLECT DESIGN ARENA", "INDIAN OIL CORP LTD",
>"IPCA LABORATORIES LTD", "INDIAN RAIL TOUR CORP LTD", "ITC LTD",
>"JINDAL STEEL & POWER LTD", "JSW STEEL LIMITED", "JUBILANT FOODWORKS LTD",
>"KOTAK MAHINDRA BANK LTD", "L&T FINANCE HOLDINGS LTD", "DR. LAL PATH LABS LTD.",
>"LAURUS LABS LIMITED", "LIC HOUSING FINANCE LTD", "LARSEN & TOUBRO LTD."
>), DIVISLAB = c("DIXON", "DLF", "DRREDDY", "ESCORTS", "EXIDEIND",
>"FEDERALBNK", "FSL", "GAIL", "GLENMARK", "GMRINFRA", "GNFC",
>"DALBHARAT", "GODREJCP", "GRANULES", "GRASIM", "GSPL", "GUJGASLTD",
>"HAL", "HAVELLS", "HCLTECH", "HDFC", "HDFCAMC", "HDFCBANK", "HDFCLIFE",
>"HEROMOTOCO", "HINDALCO", "HINDCOPPER", "HONAUT", "ICICIBANK",
>"ICICIGI", "ICICIPRULI", "IDFC", "IDFCFIRSTB", "IEX", "IGL",
>"INDHOTEL", "INDIACEM", "INDIAMART", "INDIGO", "INDUSINDBK",
>"INDUSTOWER", "INFY", "INTELLECT", "IOC", "IPCALAB", "IRCTC",
>"ITC", "JINDALSTEL", "JSWSTEEL", "JUBLFOOD", "KOTAKBANK", "L&TFH",
>"LALPATHLAB", "LAURUSLABS", "LICHSGFIN", "LT"), X4.5 = c(4.5,
>4, 4.5, 4.5, 4.5, 4.5, 4, 4.5, 4.5, 4.5, 4, 4.5, 4.5, 4.5, 4.5,
>4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4, 4.5,
>4.5, 4.5, 4.5, 4, 4.5, 4.5, 4.5, 4.5, 4, 4, 4.5, 4, 4, 4.5, 4,
>4.5, 4.5, 4, 4.5, 4, 4.5, 4.5, 4.5, 4, 4.5, 4.5, 4.5, 4.5)), class = "data.frame", row.names = c(NA,
>-56L)), structure(list(X111 = 112:167, L.T.INFOTECH.LIMITED = c("L&T TECHNOLOGY SER. LTD.",
>"LUPIN LIMITED", "MAHINDRA & MAHINDRA LTD", "M&M FIN. SERVICES LTD",
>"MANAPPURAM FINANCE LTD", "MARICO LIMITED", "MARUTI SUZUKI INDIA LTD.",
>"UNITED SPIRITS LIMITED", "MULTI COMMODITY EXCHANGE", "MAX FINANCIAL SERV LTD",
>"MAHANAGAR GAS LTD.", "MINDTREE LIMITED", "MOTHERSON SUMI SYSTEMS LT",
>"MPHASIS LIMITED", "MRF LTD", "MUTHOOT FINANCE LIMITED", "NATIONAL ALUMINIUM CO LTD",
>"INFO EDGE (I) LTD", "NAVIN FLUORINE INT. LTD", "NBCC (INDIA) LIMITED",
>"NMDC LTD.", "NTPC LTD", "OBEROI REALTY LIMITED", "ORACLE FIN SERV SOFT LTD.",
>"OIL AND NATURAL GAS CORP.", "PAGE INDUSTRIES LTD", "PIRAMAL ENTERPRISES LTD",
>"PERSISTENT SYSTEMS LTD", "PETRONET LNG LIMITED", "POWER FIN CORP LTD.",
>"PIDILITE INDUSTRIES LTD", "PI INDUSTRIES LTD", "PUNJAB NATIONAL BANK",
>"POLYCAB INDIA LIMITED", "POWER GRID CORP. LTD.", "PVR LIMITED",
>"RAIN INDUSTRIES LIMITED", "THE RAMCO CEMENTS LIMITED", "RBL BANK LIMITED",
>"REC LIMITED", "RELIANCE INDUSTRIES LTD", "STEEL AUTHORITY OF INDIA",
>"SBI CARDS & PAY SER LTD", "SBI LIFE INSURANCE CO LTD", "STATE BANK OF INDIA",
>"SHREE CEMENT LIMITED", "SIEMENS LTD", "SRF LTD", "SHRIRAM TRANSPORT FIN CO.",
>"STRIDES PHARMA SCI LTD", "SUN PHARMACEUTICAL IND L", "SUN TV NETWORK LIMITED",
>"SYNGENE INTERNATIONAL LTD", "TATA CHEMICALS LTD", "TATA COMMUNICATIONS LTD",
>"TATA CONSUMER PRODUCT LTD"), LTI = c("LTTS", "LUPIN", "M&M",
>"M&MFIN", "MANAPPURAM", "MARICO", "MARUTI", "MCDOWELL-N", "MCX",
>"MFSL", "MGL", "MINDTREE", "MOTHERSUMI", "MPHASIS", "MRF", "MUTHOOTFIN",
>"NATIONALUM", "NAUKRI", "NAVINFLUOR", "NBCC", "NMDC", "NTPC",
>"OBEROIRLTY", "OFSS", "ONGC", "PAGEIND", "PEL", "PERSISTENT",
>"PETRONET", "PFC", "PIDILITIND", "PIIND", "PNB", "POLYCAB", "POWERGRID",
>"PVR", "RAIN", "RAMCOCEM", "RBLBANK", "RECLTD", "RELIANCE", "SAIL",
>"SBICARD", "SBILIFE", "SBIN", "SHREECEM", "SIEMENS", "SRF", "SRTRANSFIN",
>"STAR", "SUNPHARMA", "SUNTV", "SYNGENE", "TATACHEM", "TATACOMM",
>"TATACONSUM"), X4.5 = c(4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5,
>4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4, 4.5,
>4.5, 4.5, 4.5, 4.5, 4.5, 4, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5,
>4.5, 4.5, 4, 4.5, 4, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5,
>4.5, 4, 4, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5)), class = "data.frame", row.names = c(NA,
>-56L)), structure(list(X168 = 169:198, TATA.MOTORS.LIMITED = c("TATA POWER CO LTD",
>"TATA STEEL LIMITED", "TATA CONSULTANCY SERV LT", "TECH MAHINDRA LIMITED",
>"TITAN COMPANY LIMITED", "TORRENT PHARMACEUTICALS L", "TORRENT POWER LTD",
>"TRENT LTD", "TVS MOTOR COMPANY  LTD", "UNITED BREWERIES LTD",
>"ULTRATECH CEMENT LIMITED", "UPL LIMITED", "VEDANTA LIMITED",
>"VOLTAS LTD", "WHIRLPOOL OF INDIA LTD", "WIPRO LTD", "ZEE ENTERTAINMENT ENT LTD",
>"BALKRISHNA IND. LTD", "BHARAT FORGE LTD", "BHARAT PETROLEUM CORP  LT",
>"EICHER MOTORS LTD", "GODREJ PROPERTIES LTD", "HINDUSTAN PETROLEUM CORP",
>"JK CEMENT LIMITED", "NESTLE INDIA LIMITED", "METROPOLIS HEALTHCARE LTD",
>"HINDUSTAN UNILEVER LTD.", "VODAFONE IDEA LIMITED", "NIPPON L I A M LTD",
>"INDIABULLS HSG FIN LTD"), TATAMOTORS = c("TATAPOWER", "TATASTEEL",
>"TCS", "TECHM", "TITAN", "TORNTPHARM", "TORNTPOWER", "TRENT",
>"TVSMOTOR", "UBL", "ULTRACEMCO", "UPL", "VEDL", "VOLTAS", "WHIRLPOOL",
>"WIPRO", "ZEEL", "BALKRISIND", "BHARATFORG", "BPCL", "EICHERMOT",
>"GODREJPROP", "HINDPETRO", "JKCEMENT", "NESTLEIND", "METROPOLIS",
>"HINDUNILVR", "IDEA", "NAM-INDIA", "IBULHSGFIN"), X4 = c(4.5,
>4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5,
>4.5, 4.5, 3, 4.5, 4.5, 4.5, 4.5, 4, 4.5, 4.5, 4.5, 4.5, 4.5,
>4.5, 4.5, 3)), class = "data.frame", row.names = c(NA, -30L)))
>
>unlist(IDTpdf) is not working. It makes IDTpdf a lot messier...
>
>I want IDTpdf to be one data frame combining all the four pages in the above url...
>
>Any help will be greatly appreciated...
>
>Thanking you,
>Yours sincerely,
>AKSHAY M KULKARNI
>
>
>	[[alternative HTML version deleted]]
>
>______________________________________________
>R-help using r-project.org mailing list -- To UNSUBSCRIBE and more, see
>https://stat.ethz.ch/mailman/listinfo/r-help
>PLEASE do read the posting guide http://www.R-project.org/posting-guide.html
>and provide commented, minimal, self-contained, reproducible code.

-- 
Sent from my phone. Please excuse my brevity.



More information about the R-help mailing list